Mirror of https://github.com/mudler/LocalAI.git (synced 2026-02-08 21:53:19 -05:00)

Compare commits: v.1.24.0 ... enable_gpu (185 commits)
| SHA1 |
|---|
| a8e91345e2 |
| ea4ade6b60 |
| 803a0ac02a |
| bde87d00b9 |
| 0eae727366 |
| 3b4c5d54d8 |
| 4e16bc2f13 |
| 562ac62f59 |
| 796d0c99aa |
| e7fa2e06f8 |
| 8123f009d0 |
| 622aaa9f7d |
| 7b1ee203ce |
| f347e51927 |
| 9b17af18b3 |
| 23c7fbfe6b |
| 035fea676a |
| 6e1a234d15 |
| 5b596ea605 |
| 6bd56460de |
| 6ef7ea2635 |
| f8c00fbaf1 |
| d9a42cc4c5 |
| fc0bc32814 |
| c62504ac92 |
| f227e918f9 |
| c132dbadce |
| b839eb80a1 |
| 23b03a7f03 |
| 9196583651 |
| fd28252e55 |
| 94f20e2eb7 |
| 5ced99a8e7 |
| c377e61ff0 |
| a6fe0a020a |
| bf2ed3d752 |
| d17a92eef3 |
| 1a7be035d3 |
| 004baaa30f |
| ef19268418 |
| e82470341f |
| 88fa42de75 |
| 432513c3ba |
| 45370c212b |
| e91f660eb1 |
| 3f3162e57c |
| 208d1fce58 |
| 128694213f |
| 8034ed3473 |
| d22069c59e |
| 5a04d32b39 |
| ab65f3a17d |
| 4e23cbebcf |
| 63418c1afc |
| 8ca671761a |
| 81a5ed9f31 |
| 528b9d9206 |
| 1a4c57fac2 |
| 44a7045732 |
| 8ac7186185 |
| 975387f7ae |
| d793b5af5e |
| 5188776224 |
| 07249c0446 |
| 188301f403 |
| e660721a0c |
| e029cc66bc |
| e34b5f0119 |
| c223364816 |
| 74fd5844ca |
| 4ebc86df84 |
| 8cd03eff58 |
| 46660a16a0 |
| 27b097309e |
| d0fa1f8e94 |
| 55e38fea0e |
| 274ace2898 |
| a8cc3709c6 |
| a28ab18987 |
| 048b81373d |
| aea1d62ae6 |
| 601e54000d |
| 7bdf707dd3 |
| 4a7e7e9fdb |
| bdf3f95346 |
| 453e9c5da9 |
| 3a69bd3ef5 |
| a69c0f765e |
| 97d1367764 |
| 880e21288e |
| 2ba9762255 |
| 30f120ee6a |
| 28a36e20aa |
| a8fb4d23f8 |
| f37a4ec9c8 |
| 31ed13094b |
| 8ccf5b2044 |
| 247d85b523 |
| 54688db994 |
| 8590f5a599 |
| 289d51c049 |
| 813eaa867c |
| abffb16292 |
| 50e439f633 |
| 25eb1415df |
| 0b28220f2b |
| 5661740990 |
| 255c31bddf |
| 7888fefeea |
| 0937835802 |
| ea806b37ac |
| d6614f3149 |
| 9a50a39848 |
| 2793e8f327 |
| c0bb5c4bf6 |
| cc74fc93b4 |
| 44b39195d6 |
| 2454110d81 |
| ee59e7d45f |
| 605c319157 |
| dc307a1cc0 |
| e7981152b2 |
| b3eb5c860b |
| 1c2f7409e3 |
| 57d41a3f94 |
| f9d2bd24eb |
| 0e7e8eec53 |
| 9a30a246d8 |
| c332499252 |
| 005f289632 |
| 3d7553317f |
| 8e4f6b2ee5 |
| d5cad7d3ae |
| 355e9d4fb5 |
| 629185e10a |
| deeef5fc24 |
| b905c07650 |
| 1ff30034e8 |
| c64b59c80c |
| 9a869bbaf6 |
| fe1b54b713 |
| cc84dfd50f |
| 158c7867e7 |
| 997c39ccd5 |
| 3bab307904 |
| 02704e38d3 |
| 9e5fb29965 |
| 7dba131d5f |
| ce0b771217 |
| 44bc7aa3d0 |
| 7f0c88ed3e |
| d15508f52c |
| b111423b9c |
| 215a51c4c1 |
| 1120847f72 |
| 704323b805 |
| 10b0e13882 |
| 901f0709c5 |
| 0d6165e481 |
| 6583eed6b2 |
| a9ca70ad4a |
| ab5b75eb01 |
| cc060a283d |
| 28db83e17b |
| dbb1f86455 |
| 02f7c555af |
| d982b38f76 |
| bc2e4b952e |
| afdc0ebfd7 |
| 1079b18ff7 |
| 8cb1061c11 |
| 2bacd0180d |
| ddf9bc2335 |
| a1afd940e3 |
| 8bb76201c0 |
| ede71d398c |
| 0c73a637f1 |
| 37700f2d98 |
| 0ec695f9e4 |
| 7ffd21dbc8 |
| 48b3920656 |
| 63d91af555 |
| a96c3bc885 |
| 77e1ae3d70 |
| 9cc8d90865 |
.env (26 lines changed)

@@ -23,6 +23,12 @@ MODELS_PATH=/models
 ## Enable debug mode
 # DEBUG=true
 
+## Disables COMPEL (Diffusers)
+# COMPEL=0
+
+## Enable/Disable single backend (useful if only one GPU is available)
+# SINGLE_ACTIVE_BACKEND=true
+
 ## Specify a build type. Available: cublas, openblas, clblas.
 ## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
 ## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.
@@ -44,3 +50,23 @@ MODELS_PATH=/models
 
 ## Specify a default upload limit in MB (whisper)
 # UPLOAD_LIMIT
+
+## List of external GRPC backends (note on the container image this variable is already set to use extra backends available in extra/)
+# EXTERNAL_GRPC_BACKENDS=my-backend:127.0.0.1:9000,my-backend2:/usr/bin/backend.py
+
+### Advanced settings ###
+### Those are not really used by LocalAI, but from components in the stack ###
+##
+### Preload libraries
+# LD_PRELOAD=
+
+### Huggingface cache for models
+# HUGGINGFACE_HUB_CACHE=/usr/local/huggingface
+
+### Python backends GRPC max workers
+### Default number of workers for GRPC Python backends.
+### This actually controls wether a backend can process multiple requests or not.
+# PYTHON_GRPC_MAX_WORKERS=1
+
+### Define the number of parallel LLAMA.cpp workers (Defaults to 1)
+# LLAMACPP_PARALLEL=1
.github/PULL_REQUEST_TEMPLATE.md (vendored, 16 lines changed)

@@ -8,16 +8,24 @@ This PR fixes #
 **[Signed commits](../CONTRIBUTING.md#signing-off-on-commits-developer-certificate-of-origin)**
 - [ ] Yes, I signed my commits.
 
 
 <!--
 Thank you for contributing to LocalAI!
 
-Contributing Conventions:
+Contributing Conventions
+-------------------------
 
-1. Include descriptive PR titles with [<component-name>] prepended.
-2. Build and test your changes before submitting a PR.
+The draft above helps to give a quick overview of your PR.
+
+Remember to remove this comment and to at least:
+
+1. Include descriptive PR titles with [<component-name>] prepended. We use [conventional commits](https://www.conventionalcommits.org/en/v1.0.0/).
+2. Build and test your changes before submitting a PR (`make build`).
 3. Sign your commits
+4. **Tag maintainer:** for a quicker response, tag the relevant maintainer (see below).
+5. **X/Twitter handle:** we announce bigger features on X/Twitter. If your PR gets announced, and you'd like a mention, we'll gladly shout you out!
 
 By following the community's contribution conventions upfront, the review process will
 be accelerated and your PR merged more quickly.
+
+If no one reviews your PR within a few days, please @-mention @mudler.
 -->
.github/workflows/bump_deps.yaml (vendored, 5 lines changed)

@@ -12,6 +12,9 @@ jobs:
         - repository: "go-skynet/go-llama.cpp"
           variable: "GOLLAMA_VERSION"
           branch: "master"
+        - repository: "ggerganov/llama.cpp"
+          variable: "CPPLLAMA_VERSION"
+          branch: "master"
         - repository: "go-skynet/go-ggml-transformers.cpp"
           variable: "GOGGMLTRANSFORMERS_VERSION"
           branch: "master"
@@ -41,7 +44,7 @@ jobs:
           branch: "master"
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Bump dependencies 🔧
        run: |
          bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }}
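For reference, the bump step above is just a plain script call; with the newly added llama.cpp matrix entry it expands roughly as follows (a sketch, assuming the script takes repository, branch and variable in that order, as the `run:` line shows):

    # Equivalent of the workflow step for the new matrix entry.
    bash .github/bump_deps.sh "ggerganov/llama.cpp" "master" "CPPLLAMA_VERSION"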
.github/workflows/image.yml (vendored, 100 lines changed)

@@ -14,15 +14,21 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  docker:
+  image-build:
     strategy:
       matrix:
         include:
           - build-type: ''
-            platforms: 'linux/amd64,linux/arm64'
+            #platforms: 'linux/amd64,linux/arm64'
+            platforms: 'linux/amd64'
             tag-latest: 'auto'
             tag-suffix: ''
             ffmpeg: ''
+          - build-type: ''
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-ffmpeg'
+            ffmpeg: 'true'
           - build-type: 'cublas'
             cuda-major-version: 11
             cuda-minor-version: 7
@@ -37,11 +43,6 @@ jobs:
             tag-latest: 'false'
             tag-suffix: '-cublas-cuda12'
             ffmpeg: ''
-          - build-type: ''
-            platforms: 'linux/amd64,linux/arm64'
-            tag-latest: 'false'
-            tag-suffix: '-ffmpeg'
-            ffmpeg: 'true'
           - build-type: 'cublas'
             cuda-major-version: 11
             cuda-minor-version: 7
@@ -57,46 +58,57 @@ jobs:
             tag-suffix: '-cublas-cuda12-ffmpeg'
             ffmpeg: 'true'
 
-    runs-on: ubuntu-latest
+    runs-on: arc-runner-set
     steps:
-      - name: Release space from worker
+      - name: Force Install GIT latest
         run: |
-          echo "Listing top largest packages"
-          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
-          head -n 30 <<< "${pkgs}"
-          echo
-          df -h
-          echo
-          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
-          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
-          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
-          sudo rm -rf /usr/local/lib/android
-          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
-          sudo rm -rf /usr/share/dotnet
-          sudo apt-get remove -y '^mono-.*' || true
-          sudo apt-get remove -y '^ghc-.*' || true
-          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
-          sudo apt-get remove -y 'php.*' || true
-          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
-          sudo apt-get remove -y '^google-.*' || true
-          sudo apt-get remove -y azure-cli || true
-          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
-          sudo apt-get remove -y '^gfortran-.*' || true
-          sudo apt-get autoremove -y
-          sudo apt-get clean
-          echo
-          echo "Listing top largest packages"
-          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
-          head -n 30 <<< "${pkgs}"
-          echo
-          sudo rm -rfv build || true
-          df -h
+          sudo apt-get update \
+          && sudo apt-get install -y software-properties-common \
+          && sudo apt-get update \
+          && sudo add-apt-repository -y ppa:git-core/ppa \
+          && sudo apt-get update \
+          && sudo apt-get install -y git
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
+      # - name: Release space from worker
+      #   run: |
+      #     echo "Listing top largest packages"
+      #     pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+      #     head -n 30 <<< "${pkgs}"
+      #     echo
+      #     df -h
+      #     echo
+      #     sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
+      #     sudo apt-get remove --auto-remove android-sdk-platform-tools || true
+      #     sudo apt-get purge --auto-remove android-sdk-platform-tools || true
+      #     sudo rm -rf /usr/local/lib/android
+      #     sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
+      #     sudo rm -rf /usr/share/dotnet
+      #     sudo apt-get remove -y '^mono-.*' || true
+      #     sudo apt-get remove -y '^ghc-.*' || true
+      #     sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
+      #     sudo apt-get remove -y 'php.*' || true
+      #     sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
+      #     sudo apt-get remove -y '^google-.*' || true
+      #     sudo apt-get remove -y azure-cli || true
+      #     sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
+      #     sudo apt-get remove -y '^gfortran-.*' || true
+      #     sudo apt-get remove -y microsoft-edge-stable || true
+      #     sudo apt-get remove -y firefox || true
+      #     sudo apt-get remove -y powershell || true
+      #     sudo apt-get remove -y r-base-core || true
+      #     sudo apt-get autoremove -y
+      #     sudo apt-get clean
+      #     echo
+      #     echo "Listing top largest packages"
+      #     pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+      #     head -n 30 <<< "${pkgs}"
+      #     echo
+      #     sudo rm -rfv build || true
+      #     df -h
       - name: Docker meta
         id: meta
-        uses: docker/metadata-action@v4
+        uses: docker/metadata-action@v5
         with:
           images: quay.io/go-skynet/local-ai
           tags: |
@@ -118,14 +130,14 @@ jobs:
 
       - name: Login to DockerHub
         if: github.event_name != 'pull_request'
-        uses: docker/login-action@v2
+        uses: docker/login-action@v3
         with:
           registry: quay.io
           username: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
           password: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
 
       - name: Build and push
-        uses: docker/build-push-action@v4
+        uses: docker/build-push-action@v5
        with:
          builder: ${{ steps.buildx.outputs.name }}
          build-args: |
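The matrix above maps build types and tag suffixes to images published on quay.io/go-skynet/local-ai. As a hedged illustration of how one of those variants would be consumed, note that the exact tag name below is an assumption stitched together from the tag-suffix values; the real tags depend on the metadata-action configuration:

    # Assumed tag name; check the registry for the tags actually published.
    docker pull quay.io/go-skynet/local-ai:master-cublas-cuda12-ffmpeg
    docker run --gpus all -p 8080:8080 -v "$PWD/models:/models" \
      quay.io/go-skynet/local-ai:master-cublas-cuda12-ffmpeg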
.github/workflows/release.yaml (vendored, 24 lines changed)

@@ -19,13 +19,22 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           submodules: true
+      - uses: actions/setup-go@v4
+        with:
+          go-version: '>=1.21.0'
       - name: Dependencies
         run: |
           sudo apt-get update
           sudo apt-get install build-essential ffmpeg
+
+          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+            cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+              -DgRPC_BUILD_TESTS=OFF \
+              ../.. && sudo make -j12 install
+
       - name: Build
         id: build
         env:
@@ -57,15 +66,26 @@ jobs:
     runs-on: macOS-latest
     steps:
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           submodules: true
+      - uses: actions/setup-go@v4
+        with:
+          go-version: '>=1.21.0'
+      - name: Dependencies
+        run: |
+          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+            cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+              -DgRPC_BUILD_TESTS=OFF \
+              ../.. && make -j12 install && rm -rf grpc
       - name: Build
         id: build
         env:
           CMAKE_ARGS: "${{ matrix.defines }}"
           BUILD_ID: "${{ matrix.build }}"
         run: |
+          export C_INCLUDE_PATH=/usr/local/include
+          export CPLUS_INCLUDE_PATH=/usr/local/include
           make dist
       - uses: actions/upload-artifact@v3
        with:
.github/workflows/test-gpu.yml (vendored, new file, 63 lines)

@@ -0,0 +1,63 @@
+---
+name: 'GPU tests'
+
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+    tags:
+      - '*'
+
+concurrency:
+  group: ci-gpu-tests-${{ github.head_ref || github.ref }}-${{ github.repository }}
+  cancel-in-progress: true
+
+jobs:
+  ubuntu-latest:
+    runs-on: gpu
+    strategy:
+      matrix:
+        go-version: ['1.21.x']
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+      - name: Setup Go ${{ matrix.go-version }}
+        uses: actions/setup-go@v4
+        with:
+          go-version: ${{ matrix.go-version }}
+      # You can test your matrix by printing the current Go version
+      - name: Display Go version
+        run: go version
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo DEBIAN_FRONTEND=noninteractive apt-get install -y make wget
+      - name: Build
+        run: |
+          if [ ! -e /run/systemd/system ]; then
+            sudo mkdir /run/systemd/system
+          fi
+          sudo mkdir -p /host/tests/${{ github.head_ref || github.ref }}
+          sudo chmod -R 777 /host/tests/${{ github.head_ref || github.ref }}
+          make \
+            TEST_DIR="/host/tests/${{ github.head_ref || github.ref }}" \
+            BUILD_TYPE=cublas \
+            prepare-e2e run-e2e-image test-e2e
+      - name: Release space from worker ♻
+        if: always()
+        run: |
+          sudo rm -rf build || true
+          sudo rm -rf bin || true
+          sudo rm -rf dist || true
+          sudo docker logs $(sudo docker ps -q --filter ancestor=localai-tests) > logs.txt
+          sudo cat logs.txt || true
+          sudo rm -rf logs.txt
+          make clean || true
+          make \
+            TEST_DIR="/host/tests/${{ github.head_ref || github.ref }}" \
+            teardown-e2e || true
+          sudo rm -rf /host/tests/${{ github.head_ref || github.ref }} || true
+          docker system prune -f -a --volumes || true
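The GPU job drives the e2e suite entirely through make targets, so the same flow can be reproduced on a CUDA host. A minimal sketch, with the scratch directory chosen arbitrarily:

    # Reproduce the GPU e2e flow from the workflow above.
    export TEST_DIR=/tmp/localai-e2e   # any writable scratch directory
    mkdir -p "$TEST_DIR"
    make TEST_DIR="$TEST_DIR" BUILD_TYPE=cublas prepare-e2e run-e2e-image test-e2e
    # Clean up afterwards, mirroring the "Release space from worker" step.
    make TEST_DIR="$TEST_DIR" teardown-e2e || true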
.github/workflows/test.yml (vendored, 112 lines changed)

@@ -14,51 +14,113 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  ubuntu-latest:
+  tests-linux:
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        go-version: ['1.21.x']
     steps:
+      - name: Release space from worker
+        run: |
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          df -h
+          echo
+          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
+          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
+          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
+          sudo rm -rf /usr/local/lib/android
+          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
+          sudo rm -rf /usr/share/dotnet
+          sudo apt-get remove -y '^mono-.*' || true
+          sudo apt-get remove -y '^ghc-.*' || true
+          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
+          sudo apt-get remove -y 'php.*' || true
+          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
+          sudo apt-get remove -y '^google-.*' || true
+          sudo apt-get remove -y azure-cli || true
+          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
+          sudo apt-get remove -y '^gfortran-.*' || true
+          sudo apt-get autoremove -y
+          sudo apt-get clean
+          echo
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          sudo rm -rfv build || true
+          df -h
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           submodules: true
+      - name: Setup Go ${{ matrix.go-version }}
+        uses: actions/setup-go@v4
+        with:
+          go-version: ${{ matrix.go-version }}
+      # You can test your matrix by printing the current Go version
+      - name: Display Go version
+        run: go version
       - name: Dependencies
         run: |
           sudo apt-get update
           sudo apt-get install build-essential ffmpeg
+          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+          sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
+          gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+          sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
+          sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
+          sudo apt-get update && \
+          sudo apt-get install -y conda
           sudo apt-get install -y ca-certificates cmake curl patch
           sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
-          sudo pip install -r extra/requirements.txt
 
-          sudo mkdir /build && sudo chmod -R 777 /build && cd /build && \
-          curl -L "https://github.com/gabime/spdlog/archive/refs/tags/v1.11.0.tar.gz" | \
-          tar -xzvf - && \
-          mkdir -p "spdlog-1.11.0/build" && \
-          cd "spdlog-1.11.0/build" && \
-          cmake .. && \
-          make -j8 && \
-          sudo cmake --install . --prefix /usr && mkdir -p "lib/Linux-$(uname -m)" && \
-          cd /build && \
-          mkdir -p "lib/Linux-$(uname -m)/piper_phonemize" && \
-          curl -L "https://github.com/rhasspy/piper-phonemize/releases/download/v1.0.0/libpiper_phonemize-amd64.tar.gz" | \
-          tar -C "lib/Linux-$(uname -m)/piper_phonemize" -xzvf - && ls -liah /build/lib/Linux-$(uname -m)/piper_phonemize/ && \
-          sudo cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /usr/lib/ && \
-          sudo ln -s /usr/lib/libpiper_phonemize.so /usr/lib/libpiper_phonemize.so.1 && \
-          sudo cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/include/. /usr/include/
+          sudo rm -rfv /usr/bin/conda || true
+          PATH=$PATH:/opt/conda/bin make -C extra/grpc/huggingface
+
+          # Pre-build piper before we start tests in order to have shared libraries in place
+          make go-piper && \
+          GO_TAGS="tts" make -C go-piper piper.o && \
+          sudo cp -rfv go-piper/piper/build/pi/lib/. /usr/lib/ && \
+
+          # Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn)
+          GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
+
+          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+          cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+            -DgRPC_BUILD_TESTS=OFF \
+            ../.. && sudo make -j12 install
       - name: Test
         run: |
-          ESPEAK_DATA="/build/lib/Linux-$(uname -m)/piper_phonemize/lib/espeak-ng-data" GO_TAGS="tts stablediffusion" make test
+          GO_TAGS="stablediffusion tts" make test
 
-  macOS-latest:
+  tests-apple:
     runs-on: macOS-latest
+    strategy:
+      matrix:
+        go-version: ['1.21.x']
     steps:
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           submodules: true
+      - name: Setup Go ${{ matrix.go-version }}
+        uses: actions/setup-go@v4
+        with:
+          go-version: ${{ matrix.go-version }}
+      # You can test your matrix by printing the current Go version
+      - name: Display Go version
+        run: go version
+      - name: Dependencies
+        run: |
+          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+          cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+            -DgRPC_BUILD_TESTS=OFF \
+            ../.. && make -j12 install && rm -rf grpc
       - name: Test
         run: |
+          export C_INCLUDE_PATH=/usr/local/include
+          export CPLUS_INCLUDE_PATH=/usr/local/include
           CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make test
.gitignore (vendored, 5 lines changed)

@@ -1,5 +1,6 @@
 # go-llama build artifacts
 go-llama
+go-llama-stable
 /gpt4all
 go-stable-diffusion
 go-piper
@@ -9,6 +10,8 @@ go-ggllm
 __pycache__/
 *.a
 get-sources
+/backend/cpp/llama/grpc-server
+/backend/cpp/llama/llama.cpp
 
 go-ggml-transformers
 go-gpt2
@@ -22,6 +25,8 @@ LocalAI
 local-ai
 # prevent above rules from omitting the helm chart
 !charts/*
+# prevent above rules from omitting the api/localai folder
+!api/localai
 
 # Ignore models
 models/*
CONTRIBUTING.md (new file, 72 lines)

@@ -0,0 +1,72 @@
+# Contributing to localAI
+
+Thank you for your interest in contributing to LocalAI! We appreciate your time and effort in helping to improve our project. Before you get started, please take a moment to review these guidelines.
+
+## Table of Contents
+
+- [Getting Started](#getting-started)
+  - [Prerequisites](#prerequisites)
+  - [Setting up the Development Environment](#setting-up-the-development-environment)
+- [Contributing](#contributing)
+  - [Submitting an Issue](#submitting-an-issue)
+  - [Creating a Pull Request (PR)](#creating-a-pull-request-pr)
+- [Coding Guidelines](#coding-guidelines)
+- [Testing](#testing)
+- [Documentation](#documentation)
+- [Community and Communication](#community-and-communication)
+
+## Getting Started
+
+### Prerequisites
+
+- Golang [1.21]
+- Git
+- macOS/Linux
+
+### Setting up the Development Environment and running localAI in the local environment
+
+1. Clone the repository: `git clone https://github.com/go-skynet/LocalAI.git`
+2. Navigate to the project directory: `cd LocalAI`
+3. Install the required dependencies: `make prepare`
+4. Run LocalAI: `make run`
+
+## Contributing
+
+We welcome contributions from everyone! To get started, follow these steps:
+
+### Submitting an Issue
+
+If you find a bug, have a feature request, or encounter any issues, please check the [issue tracker](https://github.com/go-skynet/LocalAI/issues) to see if a similar issue has already been reported. If not, feel free to [create a new issue](https://github.com/go-skynet/LocalAI/issues/new) and provide as much detail as possible.
+
+### Creating a Pull Request (PR)
+
+1. Fork the repository.
+2. Create a new branch with a descriptive name: `git checkout -b [branch name]`
+3. Make your changes and commit them.
+4. Push the changes to your fork: `git push origin [branch name]`
+5. Create a new pull request from your branch to the main project's `main` or `master` branch.
+6. Provide a clear description of your changes in the pull request.
+7. Make any requested changes during the review process.
+8. Once your PR is approved, it will be merged into the main project.
+
+## Coding Guidelines
+
+- No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like []`golangci-lint`](https://golangci-lint.run) can help you here.
+
+## Testing
+
+`make test` cannot handle all the model now. Please be sure to add a test case for the new features or the part was changed.
+
+## Documentation
+
+- We are welcome the contribution of the documents, please open new PR in the official document repo [localai-website](https://github.com/go-skynet/localai-website)
+
+## Community and Communication
+
+- You can reach out via the Github issue tracker.
+- Open a new discussion at [Discussion](https://github.com/go-skynet/LocalAI/discussions)
+- Join the Discord channel [Discord](https://discord.gg/uJAeKSAGDy)
+
+---
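Condensed into a single shell session, the setup steps documented above look like this (a sketch of exactly the listed commands, nothing beyond them):

    # Clone, prepare dependencies and run LocalAI locally, per CONTRIBUTING.md.
    git clone https://github.com/go-skynet/LocalAI.git
    cd LocalAI
    make prepare
    make run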
Dockerfile (162 lines changed)

@@ -1,22 +1,27 @@
-ARG GO_VERSION=1.20-bullseye
+ARG GO_VERSION=1.21-bullseye
+ARG IMAGE_TYPE=extras
+# extras or core
 
-FROM golang:$GO_VERSION as requirements
+FROM golang:$GO_VERSION as requirements-core
 
 ARG BUILD_TYPE
 ARG CUDA_MAJOR_VERSION=11
 ARG CUDA_MINOR_VERSION=7
-ARG SPDLOG_VERSION="1.11.0"
-ARG PIPER_PHONEMIZE_VERSION='1.0.0'
 ARG TARGETARCH
 ARG TARGETVARIANT
 
 ENV BUILD_TYPE=${BUILD_TYPE}
-ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/extra/grpc/huggingface/huggingface.py,autogptq:/build/extra/grpc/autogptq/autogptq.py,bark:/build/extra/grpc/bark/ttsbark.py,diffusers:/build/extra/grpc/diffusers/backend_diffusers.py,exllama:/build/extra/grpc/exllama/exllama.py"
+ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/extra/grpc/huggingface/run.sh,autogptq:/build/extra/grpc/autogptq/run.sh,bark:/build/extra/grpc/bark/run.sh,diffusers:/build/extra/grpc/diffusers/run.sh,exllama:/build/extra/grpc/exllama/run.sh,vall-e-x:/build/extra/grpc/vall-e-x/run.sh,vllm:/build/extra/grpc/vllm/run.sh"
 ENV GALLERIES='[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}, {"url": "github:go-skynet/model-gallery/huggingface.yaml","name":"huggingface"}]'
 ARG GO_TAGS="stablediffusion tts"
 
 RUN apt-get update && \
-    apt-get install -y ca-certificates cmake curl patch pip
+    apt-get install -y ca-certificates curl patch pip cmake && apt-get clean
+
+COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
+RUN update-ca-certificates
 
 # Use the variables in subsequent instructions
 RUN echo "Target Architecture: $TARGETARCH"
@@ -30,63 +35,62 @@ RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
     dpkg -i cuda-keyring_1.0-1_all.deb && \
     rm -f cuda-keyring_1.0-1_all.deb && \
     apt-get update && \
-    apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+    apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && apt-get clean \
     ; fi
 ENV PATH /usr/local/cuda/bin:${PATH}
+
+# OpenBLAS requirements and stable diffusion
+RUN apt-get install -y \
+    libopenblas-dev \
+    libopencv-dev \
+    && apt-get clean
+
+# Set up OpenCV
+RUN ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
+
+WORKDIR /build
+
+RUN test -n "$TARGETARCH" \
+    || (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')
 
 # Extras requirements
+FROM requirements-core as requirements-extras
+
+RUN curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+    install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
+    gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list && \
+    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list && \
+    apt-get update && \
+    apt-get install -y conda
+
 COPY extra/requirements.txt /build/extra/requirements.txt
 ENV PATH="/root/.cargo/bin:${PATH}"
 RUN pip install --upgrade pip
 RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
-RUN if [ "${TARGETARCH}" = "amd64" ]; then \
-    pip install git+https://github.com/suno-ai/bark.git diffusers invisible_watermark transformers accelerate safetensors;\
-    fi
-RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "amd64" ]; then \
-    pip install torch && pip install auto-gptq https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION}-cp39-cp39-linux_x86_64.whl;\
-    fi
-RUN pip install -r /build/extra/requirements.txt && rm -rf /build/extra/requirements.txt
+#RUN if [ "${TARGETARCH}" = "amd64" ]; then \
+# pip install git+https://github.com/suno-ai/bark.git diffusers invisible_watermark transformers accelerate safetensors;\
+# fi
+#RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "amd64" ]; then \
+# pip install torch vllm && pip install auto-gptq https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION}-cp39-cp39-linux_x86_64.whl;\
+# fi
+#RUN pip install -r /build/extra/requirements.txt && rm -rf /build/extra/requirements.txt
 
-WORKDIR /build
-
-# OpenBLAS requirements
-RUN apt-get install -y libopenblas-dev
-
-# Stable Diffusion requirements
-RUN apt-get install -y libopencv-dev && \
-    ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
-
-# piper requirements
-# Use pre-compiled Piper phonemization library (includes onnxruntime)
-#RUN if echo "${GO_TAGS}" | grep -q "tts"; then \
-RUN test -n "$TARGETARCH" \
-    || (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')
-
-RUN curl -L "https://github.com/gabime/spdlog/archive/refs/tags/v${SPDLOG_VERSION}.tar.gz" | \
-    tar -xzvf - && \
-    mkdir -p "spdlog-${SPDLOG_VERSION}/build" && \
-    cd "spdlog-${SPDLOG_VERSION}/build" && \
-    cmake .. && \
-    make -j8 && \
-    cmake --install . --prefix /usr && mkdir -p "lib/Linux-$(uname -m)" && \
-    cd /build && \
-    mkdir -p "lib/Linux-$(uname -m)/piper_phonemize" && \
-    curl -L "https://github.com/rhasspy/piper-phonemize/releases/download/v${PIPER_PHONEMIZE_VERSION}/libpiper_phonemize-${TARGETARCH:-$(go env GOARCH)}${TARGETVARIANT}.tar.gz" | \
-    tar -C "lib/Linux-$(uname -m)/piper_phonemize" -xzvf - && ls -liah /build/lib/Linux-$(uname -m)/piper_phonemize/ && \
-    cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /usr/lib/ && \
-    ln -s /usr/lib/libpiper_phonemize.so /usr/lib/libpiper_phonemize.so.1 && \
-    cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/include/. /usr/include/
+# Vall-e-X
+RUN git clone https://github.com/Plachtaa/VALL-E-X.git /usr/lib/vall-e-x && cd /usr/lib/vall-e-x && pip install -r requirements.txt
 # \
 # ; fi
 
 ###################################
 ###################################
 
-FROM requirements as builder
+FROM requirements-${IMAGE_TYPE} as builder
 
 ARG GO_TAGS="stablediffusion tts"
+ARG GRPC_BACKENDS
+ARG BUILD_GRPC=true
+ENV GRPC_BACKENDS=${GRPC_BACKENDS}
 ENV GO_TAGS=${GO_TAGS}
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
@@ -101,21 +105,43 @@ RUN make prepare
 COPY . .
 COPY .git .
 
-RUN ESPEAK_DATA=/build/lib/Linux-$(uname -m)/piper_phonemize/lib/espeak-ng-data make build
+# stablediffusion does not tolerate a newer version of abseil, build it first
+RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
+
+RUN if [ "${BUILD_GRPC}" = "true" ]; then \
+    git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+    cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+      -DgRPC_BUILD_TESTS=OFF \
+      ../.. && make -j12 install && rm -rf grpc \
+    ; fi
+
+# Rebuild with defaults backends
+RUN make build
+
+RUN if [ ! -d "/build/go-piper/piper/build/pi/lib/" ]; then \
+    mkdir -p /build/go-piper/piper/build/pi/lib/ \
+    touch /build/go-piper/piper/build/pi/lib/keep \
+    ; fi
 
 ###################################
 ###################################
 
-FROM requirements
+FROM requirements-${IMAGE_TYPE}
 
 ARG FFMPEG
 ARG BUILD_TYPE
 ARG TARGETARCH
+ARG IMAGE_TYPE=extras
 
 ENV BUILD_TYPE=${BUILD_TYPE}
 ENV REBUILD=false
 ENV HEALTHCHECK_ENDPOINT=http://localhost:8080/readyz
+
+ARG CUDA_MAJOR_VERSION=11
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
+ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
+ENV NVIDIA_VISIBLE_DEVICES=all
 
 # Add FFmpeg
 RUN if [ "${FFMPEG}" = "true" ]; then \
     apt-get install -y ffmpeg \
@@ -129,11 +155,49 @@ WORKDIR /build
 # https://github.com/go-skynet/LocalAI/pull/434
 COPY . .
 RUN make prepare-sources
+
+# Copy the binary
 COPY --from=builder /build/local-ai ./
-# To resolve exllama import error
-RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH:-$(go env GOARCH)}" = "amd64" ]; then \
+
+# Copy shared libraries for piper
+COPY --from=builder /build/go-piper/piper/build/pi/lib/* /usr/lib/
+
+# do not let stablediffusion rebuild (requires an older version of absl)
+COPY --from=builder /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion
+
+## Duplicated from Makefile to avoid having a big layer that's hard to push
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+    PATH=$PATH:/opt/conda/bin make -C extra/grpc/autogptq \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+    PATH=$PATH:/opt/conda/bin make -C extra/grpc/bark \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+    PATH=$PATH:/opt/conda/bin make -C extra/grpc/diffusers \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+    PATH=$PATH:/opt/conda/bin make -C extra/grpc/vllm \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+    PATH=$PATH:/opt/conda/bin make -C extra/grpc/huggingface \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+    PATH=$PATH:/opt/conda/bin make -C extra/grpc/vall-e-x \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+    PATH=$PATH:/opt/conda/bin make -C extra/grpc/exllama \
+    ; fi
+
+# Copy VALLE-X as it's not a real "lib"
+RUN if [ -d /usr/lib/vall-e-x ]; then \
+    cp -rfv /usr/lib/vall-e-x/* ./ ; \
+    fi
+
+# we also copy exllama libs over to resolve exllama import error
+RUN if [ -d /usr/local/lib/python3.9/dist-packages/exllama ]; then \
     cp -rfv /usr/local/lib/python3.9/dist-packages/exllama extra/grpc/exllama/;\
     fi
 
 # Define the health check command
 HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
   CMD curl -f $HEALTHCHECK_ENDPOINT || exit 1
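The Dockerfile now splits into requirements-core and requirements-extras stages selected through IMAGE_TYPE, with BUILD_GRPC controlling whether gRPC is built inside the image. A hedged example of building both flavours locally; the argument values and local tag names are illustrative, not mandated by the diff:

    # Core image without the conda/Python extras stage.
    docker build --build-arg IMAGE_TYPE=core -t local-ai:core .

    # Extras image with CUDA 11.7, matching the cublas entries in the CI matrix.
    docker build \
      --build-arg IMAGE_TYPE=extras \
      --build-arg BUILD_TYPE=cublas \
      --build-arg CUDA_MAJOR_VERSION=11 \
      --build-arg CUDA_MINOR_VERSION=7 \
      --build-arg GO_TAGS="stablediffusion tts" \
      -t local-ai:cublas-extras .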
216
Makefile
216
Makefile
@@ -4,11 +4,15 @@ GOVET=$(GOCMD) vet
|
|||||||
BINARY_NAME=local-ai
|
BINARY_NAME=local-ai
|
||||||
|
|
||||||
# llama.cpp versions
|
# llama.cpp versions
|
||||||
GOLLAMA_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
|
GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0
|
||||||
|
|
||||||
|
GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
|
||||||
|
|
||||||
|
CPPLLAMA_VERSION?=a75fa576abba9d37f463580c379e4bbf1e1ad03c
|
||||||
|
|
||||||
# gpt4all version
|
# gpt4all version
|
||||||
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
|
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
|
||||||
GPT4ALL_VERSION?=0f2bb506a8ee752afc06cbb832773bf85b97eef3
|
GPT4ALL_VERSION?=27a8b020c36b0df8f8b82a252d261cda47cf44b8
|
||||||
|
|
||||||
# go-ggml-transformers version
|
# go-ggml-transformers version
|
||||||
GOGGMLTRANSFORMERS_VERSION?=ffb09d7dd71e2cbc6c5d7d05357d230eea6f369a
|
GOGGMLTRANSFORMERS_VERSION?=ffb09d7dd71e2cbc6c5d7d05357d230eea6f369a
|
||||||
@@ -24,23 +28,23 @@ WHISPER_CPP_VERSION?=85ed71aaec8e0612a84c0b67804bde75aa75a273
|
|||||||
BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d
|
BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d
|
||||||
|
|
||||||
# go-piper version
|
# go-piper version
|
||||||
PIPER_VERSION?=56b8a81b4760a6fbee1a82e62f007ae7e8f010a7
|
PIPER_VERSION?=736f6fb639ab8e3397356e48eeb6bdcb9da88a78
|
||||||
|
|
||||||
# go-bloomz version
|
|
||||||
BLOOMZ_VERSION?=1834e77b83faafe912ad4092ccf7f77937349e2f
|
|
||||||
|
|
||||||
# stablediffusion version
|
# stablediffusion version
|
||||||
STABLEDIFFUSION_VERSION?=d89260f598afb809279bc72aa0107b4292587632
|
STABLEDIFFUSION_VERSION?=d89260f598afb809279bc72aa0107b4292587632
|
||||||
|
|
||||||
# Go-ggllm
|
|
||||||
GOGGLLM_VERSION?=862477d16eefb0805261c19c9b0d053e3b2b684b
|
|
||||||
|
|
||||||
export BUILD_TYPE?=
|
export BUILD_TYPE?=
|
||||||
|
export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
|
||||||
|
export CMAKE_ARGS?=
|
||||||
CGO_LDFLAGS?=
|
CGO_LDFLAGS?=
|
||||||
CUDA_LIBPATH?=/usr/local/cuda/lib64/
|
CUDA_LIBPATH?=/usr/local/cuda/lib64/
|
||||||
GO_TAGS?=
|
GO_TAGS?=
|
||||||
BUILD_ID?=git
|
BUILD_ID?=git
|
||||||
|
|
||||||
|
TEST_DIR=/tmp/test
|
||||||
|
|
||||||
|
RANDOM := $(shell bash -c 'echo $$RANDOM')
|
||||||
|
|
||||||
VERSION?=$(shell git describe --always --tags || echo "dev" )
|
VERSION?=$(shell git describe --always --tags || echo "dev" )
|
||||||
# go tool nm ./local-ai | grep Commit
|
# go tool nm ./local-ai | grep Commit
|
||||||
LD_FLAGS?=
|
LD_FLAGS?=
|
||||||
@@ -48,7 +52,6 @@ override LD_FLAGS += -X "github.com/go-skynet/LocalAI/internal.Version=$(VERSION
|
|||||||
override LD_FLAGS += -X "github.com/go-skynet/LocalAI/internal.Commit=$(shell git rev-parse HEAD)"
|
override LD_FLAGS += -X "github.com/go-skynet/LocalAI/internal.Commit=$(shell git rev-parse HEAD)"
|
||||||
|
|
||||||
OPTIONAL_TARGETS?=
|
OPTIONAL_TARGETS?=
|
||||||
ESPEAK_DATA?=
|
|
||||||
|
|
||||||
OS := $(shell uname -s)
|
OS := $(shell uname -s)
|
||||||
ARCH := $(shell uname -m)
|
ARCH := $(shell uname -m)
|
||||||
@@ -58,13 +61,19 @@ WHITE := $(shell tput -Txterm setaf 7)
|
|||||||
CYAN := $(shell tput -Txterm setaf 6)
|
CYAN := $(shell tput -Txterm setaf 6)
|
||||||
RESET := $(shell tput -Txterm sgr0)
|
RESET := $(shell tput -Txterm sgr0)
|
||||||
|
|
||||||
|
# Default Docker bridge IP
|
||||||
|
E2E_BRIDGE_IP?=172.17.0.1
|
||||||
|
|
||||||
ifndef UNAME_S
|
ifndef UNAME_S
|
||||||
UNAME_S := $(shell uname -s)
|
UNAME_S := $(shell uname -s)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
# workaround for rwkv.cpp
|
|
||||||
ifeq ($(UNAME_S),Darwin)
|
ifeq ($(UNAME_S),Darwin)
|
||||||
CGO_LDFLAGS += -lcblas -framework Accelerate
|
CGO_LDFLAGS += -lcblas -framework Accelerate
|
||||||
|
ifneq ($(BUILD_TYPE),metal)
|
||||||
|
# explicit disable metal if on Darwin and metal is disabled
|
||||||
|
CMAKE_ARGS+=-DLLAMA_METAL=OFF
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(BUILD_TYPE),openblas)
|
ifeq ($(BUILD_TYPE),openblas)
|
||||||
@@ -76,6 +85,18 @@ ifeq ($(BUILD_TYPE),cublas)
|
|||||||
export LLAMA_CUBLAS=1
|
export LLAMA_CUBLAS=1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(BUILD_TYPE),hipblas)
|
||||||
|
ROCM_HOME ?= /opt/rocm
|
||||||
|
export CXX=$(ROCM_HOME)/llvm/bin/clang++
|
||||||
|
export CC=$(ROCM_HOME)/llvm/bin/clang
|
||||||
|
# Llama-stable has no hipblas support, so override it here.
|
||||||
|
export STABLE_BUILD_TYPE=
|
||||||
|
GPU_TARGETS ?= gfx900,gfx90a,gfx1030,gfx1031,gfx1100
|
||||||
|
AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
|
||||||
|
CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
|
||||||
|
CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(BUILD_TYPE),metal)
|
ifeq ($(BUILD_TYPE),metal)
|
||||||
CGO_LDFLAGS+=-framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
|
CGO_LDFLAGS+=-framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
|
||||||
export LLAMA_METAL=1
|
export LLAMA_METAL=1
|
||||||
@@ -98,9 +119,19 @@ endif
|
|||||||
ifeq ($(findstring tts,$(GO_TAGS)),tts)
|
ifeq ($(findstring tts,$(GO_TAGS)),tts)
|
||||||
# OPTIONAL_TARGETS+=go-piper/libpiper_binding.a
|
# OPTIONAL_TARGETS+=go-piper/libpiper_binding.a
|
||||||
# OPTIONAL_TARGETS+=backend-assets/espeak-ng-data
|
# OPTIONAL_TARGETS+=backend-assets/espeak-ng-data
|
||||||
|
PIPER_CGO_CXXFLAGS+=-I$(shell pwd)/go-piper/piper/src/cpp -I$(shell pwd)/go-piper/piper/build/fi/include -I$(shell pwd)/go-piper/piper/build/pi/include -I$(shell pwd)/go-piper/piper/build/si/include
|
||||||
|
PIPER_CGO_LDFLAGS+=-L$(shell pwd)/go-piper/piper/build/fi/lib -L$(shell pwd)/go-piper/piper/build/pi/lib -L$(shell pwd)/go-piper/piper/build/si/lib -lfmt -lspdlog
|
||||||
OPTIONAL_GRPC+=backend-assets/grpc/piper
|
OPTIONAL_GRPC+=backend-assets/grpc/piper
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
+ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/llama backend-assets/grpc/llama-cpp backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
+GRPC_BACKENDS?=$(ALL_GRPC_BACKENDS) $(OPTIONAL_GRPC)
+
+# If empty, then we build all
+ifeq ($(GRPC_BACKENDS),)
+GRPC_BACKENDS=$(ALL_GRPC_BACKENDS)
+endif

 .PHONY: all test build vendor

 all: help

@@ -110,14 +141,6 @@ gpt4all:
 	git clone --recurse-submodules $(GPT4ALL_REPO) gpt4all
 	cd gpt4all && git checkout -b build $(GPT4ALL_VERSION) && git submodule update --init --recursive --depth 1

-## go-ggllm
-go-ggllm:
-	git clone --recurse-submodules https://github.com/mudler/go-ggllm.cpp go-ggllm
-	cd go-ggllm && git checkout -b build $(GOGGLLM_VERSION) && git submodule update --init --recursive --depth 1
-
-go-ggllm/libggllm.a: go-ggllm
-	$(MAKE) -C go-ggllm BUILD_TYPE=$(BUILD_TYPE) libggllm.a
-
 ## go-piper
 go-piper:
 	git clone --recurse-submodules https://github.com/mudler/go-piper go-piper

@@ -144,14 +167,6 @@ go-rwkv:
 go-rwkv/librwkv.a: go-rwkv
 	cd go-rwkv && cd rwkv.cpp && cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF && cmake --build . && cp librwkv.a ..

-## bloomz
-bloomz:
-	git clone --recurse-submodules https://github.com/go-skynet/bloomz.cpp bloomz
-	cd bloomz && git checkout -b build $(BLOOMZ_VERSION) && git submodule update --init --recursive --depth 1
-
-bloomz/libbloomz.a: bloomz
-	cd bloomz && make libbloomz.a
-
 go-bert/libgobert.a: go-bert
 	$(MAKE) -C go-bert libgobert.a

@@ -161,14 +176,10 @@ backend-assets/gpt4all: gpt4all/gpt4all-bindings/golang/libgpt4all.a
 	@cp gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
 	@cp gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true

-backend-assets/espeak-ng-data:
+backend-assets/espeak-ng-data: go-piper
 	mkdir -p backend-assets/espeak-ng-data
-ifdef ESPEAK_DATA
-	@cp -rf $(ESPEAK_DATA)/. backend-assets/espeak-ng-data
-else
-	@echo "ESPEAK_DATA not set, skipping tts. Note that this will break the tts functionality."
-	@touch backend-assets/espeak-ng-data/keep
-endif
+	$(MAKE) -C go-piper piper.o
+	@cp -rf go-piper/piper/build/pi/share/espeak-ng-data/. backend-assets/espeak-ng-data

 gpt4all/gpt4all-bindings/golang/libgpt4all.a: gpt4all
 	$(MAKE) -C gpt4all/gpt4all-bindings/golang/ libgpt4all.a
@@ -192,26 +203,30 @@ go-llama:
 	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama
 	cd go-llama && git checkout -b build $(GOLLAMA_VERSION) && git submodule update --init --recursive --depth 1

+go-llama-stable:
+	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama-stable
+	cd go-llama-stable && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1
+
 go-llama/libbinding.a: go-llama
 	$(MAKE) -C go-llama BUILD_TYPE=$(BUILD_TYPE) libbinding.a

-go-piper/libpiper_binding.a:
+go-llama-stable/libbinding.a: go-llama-stable
+	$(MAKE) -C go-llama-stable BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
+
+go-piper/libpiper_binding.a: go-piper
 	$(MAKE) -C go-piper libpiper_binding.a example/main

-get-sources: go-llama go-ggllm go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert bloomz go-stable-diffusion
+get-sources: go-llama go-llama-stable go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert go-stable-diffusion
 	touch $@

 replace:
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
 	$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(shell pwd)/gpt4all/gpt4all-bindings/golang
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-ggml-transformers.cpp=$(shell pwd)/go-ggml-transformers
 	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/go-rwkv
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(shell pwd)/whisper.cpp
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(shell pwd)/go-bert
-	$(GOCMD) mod edit -replace github.com/go-skynet/bloomz.cpp=$(shell pwd)/bloomz
 	$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(shell pwd)/go-stable-diffusion
 	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(shell pwd)/go-piper
-	$(GOCMD) mod edit -replace github.com/mudler/go-ggllm.cpp=$(shell pwd)/go-ggllm

 prepare-sources: get-sources replace
 	$(GOCMD) mod download

@@ -220,15 +235,14 @@ prepare-sources: get-sources replace
 rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
 	$(MAKE) -C go-llama clean
+	$(MAKE) -C go-llama-stable clean
 	$(MAKE) -C gpt4all/gpt4all-bindings/golang/ clean
 	$(MAKE) -C go-ggml-transformers clean
 	$(MAKE) -C go-rwkv clean
 	$(MAKE) -C whisper.cpp clean
 	$(MAKE) -C go-stable-diffusion clean
 	$(MAKE) -C go-bert clean
-	$(MAKE) -C bloomz clean
 	$(MAKE) -C go-piper clean
-	$(MAKE) -C go-ggllm clean
 	$(MAKE) build

 prepare: prepare-sources $(OPTIONAL_TARGETS)

@@ -239,18 +253,21 @@ clean: ## Remove build related file
 	rm -f prepare
 	rm -rf ./go-llama
 	rm -rf ./gpt4all
+	rm -rf ./go-llama-stable
 	rm -rf ./go-gpt2
 	rm -rf ./go-stable-diffusion
 	rm -rf ./go-ggml-transformers
 	rm -rf ./backend-assets
 	rm -rf ./go-rwkv
 	rm -rf ./go-bert
-	rm -rf ./bloomz
 	rm -rf ./whisper.cpp
 	rm -rf ./go-piper
-	rm -rf ./go-ggllm
 	rm -rf $(BINARY_NAME)
 	rm -rf release/
+	rm -rf ./backend/cpp/grpc/grpc_repo
+	rm -rf ./backend/cpp/grpc/build
+	rm -rf ./backend/cpp/grpc/installed_packages
+	$(MAKE) -C backend/cpp/llama clean

 ## Build:

@@ -273,12 +290,12 @@ run: prepare ## run local-ai
 test-models/testmodel:
 	mkdir test-models
 	mkdir test-dir
-	wget https://huggingface.co/nnakasato/ggml-model-test/resolve/main/ggml-model-q4.bin -O test-models/testmodel
-	wget https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
-	wget https://huggingface.co/skeskinen/ggml/resolve/main/all-MiniLM-L6-v2/ggml-model-q4_0.bin -O test-models/bert
-	wget https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
-	wget https://huggingface.co/mudler/rwkv-4-raven-1.5B-ggml/resolve/main/RWKV-4-Raven-1B5-v11-Eng99%2525-Other1%2525-20230425-ctx4096_Q4_0.bin -O test-models/rwkv
-	wget https://raw.githubusercontent.com/saharNooby/rwkv.cpp/5eb8f09c146ea8124633ab041d9ea0b1f1db4459/rwkv/20B_tokenizer.json -O test-models/rwkv.tokenizer.json
+	wget -q https://huggingface.co/nnakasato/ggml-model-test/resolve/main/ggml-model-q4.bin -O test-models/testmodel
+	wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
+	wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
+	wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
+	wget -q https://huggingface.co/mudler/rwkv-4-raven-1.5B-ggml/resolve/main/RWKV-4-Raven-1B5-v11-Eng99%2525-Other1%2525-20230425-ctx4096_Q4_0.bin -O test-models/rwkv
+	wget -q https://raw.githubusercontent.com/saharNooby/rwkv.cpp/5eb8f09c146ea8124633ab041d9ea0b1f1db4459/rwkv/20B_tokenizer.json -O test-models/rwkv.tokenizer.json
 	cp tests/models_fixtures/* test-models

 prepare-test: grpcs
@@ -289,13 +306,34 @@ test: prepare test-models/testmodel grpcs
 	@echo 'Running tests'
 	export GO_TAGS="tts stablediffusion"
 	$(MAKE) prepare-test
-	HUGGINGFACE_GRPC=$(abspath ./)/extra/grpc/huggingface/huggingface.py TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama" --flake-attempts 5 -v -r ./api ./pkg
+	HUGGINGFACE_GRPC=$(abspath ./)/extra/grpc/huggingface/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf" --flake-attempts 5 --fail-fast -v -r ./api ./pkg
 	$(MAKE) test-gpt4all
 	$(MAKE) test-llama
+	$(MAKE) test-llama-gguf
 	$(MAKE) test-tts
 	$(MAKE) test-stablediffusion

+prepare-e2e:
+	mkdir -p $(TEST_DIR)
+	cp -rfv $(abspath ./tests/e2e-fixtures)/gpu.yaml $(TEST_DIR)/gpu.yaml
+	test -e $(TEST_DIR)/ggllm-test-model.bin || wget -q https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct.Q2_K.gguf -O $(TEST_DIR)/ggllm-test-model.bin
+	docker build --build-arg BUILD_GRPC=true --build-arg GRPC_BACKENDS="$(GRPC_BACKENDS)" --build-arg IMAGE_TYPE=core --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg CUDA_MAJOR_VERSION=11 --build-arg CUDA_MINOR_VERSION=7 --build-arg FFMPEG=true -t localai-tests .
+
+run-e2e-image:
+	ls -liah $(abspath ./tests/e2e-fixtures)
+	docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --gpus all --name e2e-tests-$(RANDOM) localai-tests
+
+test-e2e:
+	@echo 'Running e2e tests'
+	BUILD_TYPE=$(BUILD_TYPE) \
+	LOCALAI_API=http://$(E2E_BRIDGE_IP):5390/v1 \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e
+
+teardown-e2e:
+	rm -rf $(TEST_DIR) || true
+	docker stop $$(docker ps -q --filter ancestor=localai-tests)
+
 test-gpt4all: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="gpt4all" --flake-attempts 5 -v -r ./api ./pkg

@@ -304,6 +342,10 @@ test-llama: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r ./api ./pkg

+test-llama-gguf: prepare-test
+	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r ./api ./pkg
+
 test-tts: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts 1 -v -r ./api ./pkg

@@ -340,17 +382,26 @@ protogen-python:
 	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/exllama/ --grpc_python_out=extra/grpc/exllama/ pkg/grpc/proto/backend.proto
 	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/bark/ --grpc_python_out=extra/grpc/bark/ pkg/grpc/proto/backend.proto
 	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/diffusers/ --grpc_python_out=extra/grpc/diffusers/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/vall-e-x/ --grpc_python_out=extra/grpc/vall-e-x/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/vllm/ --grpc_python_out=extra/grpc/vllm/ pkg/grpc/proto/backend.proto

 ## GRPC
+# Note: it is duplicated in the Dockerfile
+prepare-extra-conda-environments:
+	$(MAKE) -C extra/grpc/autogptq
+	$(MAKE) -C extra/grpc/bark
+	$(MAKE) -C extra/grpc/diffusers
+	$(MAKE) -C extra/grpc/vllm
+	$(MAKE) -C extra/grpc/huggingface
+	$(MAKE) -C extra/grpc/vall-e-x
+	$(MAKE) -C extra/grpc/exllama
+
 backend-assets/grpc:
 	mkdir -p backend-assets/grpc

-backend-assets/grpc/falcon: backend-assets/grpc go-ggllm/libggllm.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggllm LIBRARY_PATH=$(shell pwd)/go-ggllm \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/falcon ./cmd/grpc/falcon/
-
 backend-assets/grpc/llama: backend-assets/grpc go-llama/libbinding.a
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama LIBRARY_PATH=$(shell pwd)/go-llama \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./cmd/grpc/llama/
 # TODO: every binary should have its own folder instead, so can have different metal implementations

@@ -358,6 +409,42 @@ ifeq ($(BUILD_TYPE),metal)
 	cp go-llama/build/bin/ggml-metal.metal backend-assets/grpc/
 endif

+## BACKEND CPP LLAMA START
+# Sets the variables in case it has to build the gRPC locally.
+INSTALLED_PACKAGES=$(CURDIR)/backend/cpp/grpc/installed_packages
+INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake
+ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
+	-DProtobuf_DIR=${INSTALLED_LIB_CMAKE}/protobuf \
+	-Dutf8_range_DIR=${INSTALLED_LIB_CMAKE}/utf8_range \
+	-DgRPC_DIR=${INSTALLED_LIB_CMAKE}/grpc \
+	-DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=${INSTALLED_PACKAGES}/include
+
+backend/cpp/llama/grpc-server:
+ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
+	backend/cpp/grpc/script/build_grpc.sh ${INSTALLED_PACKAGES}
+	export _PROTOBUF_PROTOC=${INSTALLED_PACKAGES}/bin/proto && \
+	export _GRPC_CPP_PLUGIN_EXECUTABLE=${INSTALLED_PACKAGES}/bin/grpc_cpp_plugin && \
+	export PATH=${PATH}:${INSTALLED_PACKAGES}/bin && \
+	CMAKE_ARGS="${ADDED_CMAKE_ARGS}" LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
+else
+	echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
+	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
+endif
+## BACKEND CPP LLAMA END
+
+##
+backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/grpc-server
+	cp -rfv backend/cpp/llama/grpc-server backend-assets/grpc/llama-cpp
+# TODO: every binary should have its own folder instead, so can have different metal implementations
+ifeq ($(BUILD_TYPE),metal)
+	cp backend/cpp/llama/llama.cpp/build/bin/ggml-metal.metal backend-assets/grpc/
+endif
+
+backend-assets/grpc/llama-stable: backend-assets/grpc go-llama-stable/libbinding.a
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama-stable
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama-stable LIBRARY_PATH=$(shell pwd)/go-llama \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-stable ./cmd/grpc/llama-stable/
+
 backend-assets/grpc/gpt4all: backend-assets/grpc backend-assets/gpt4all gpt4all/gpt4all-bindings/golang/libgpt4all.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./cmd/grpc/gpt4all/

@@ -398,10 +485,6 @@ backend-assets/grpc/rwkv: backend-assets/grpc go-rwkv/librwkv.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-rwkv LIBRARY_PATH=$(shell pwd)/go-rwkv \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./cmd/grpc/rwkv/

-backend-assets/grpc/bloomz: backend-assets/grpc bloomz/libbloomz.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/bloomz LIBRARY_PATH=$(shell pwd)/bloomz \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bloomz ./cmd/grpc/bloomz/
-
 backend-assets/grpc/bert-embeddings: backend-assets/grpc go-bert/libgobert.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-bert LIBRARY_PATH=$(shell pwd)/go-bert \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./cmd/grpc/bert-embeddings/

@@ -409,16 +492,19 @@ backend-assets/grpc/bert-embeddings: backend-assets/grpc go-bert/libgobert.a
 backend-assets/grpc/langchain-huggingface: backend-assets/grpc
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/langchain-huggingface ./cmd/grpc/langchain-huggingface/

-backend-assets/grpc/stablediffusion: backend-assets/grpc go-stable-diffusion/libstablediffusion.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-stable-diffusion/ LIBRARY_PATH=$(shell pwd)/go-stable-diffusion/ \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./cmd/grpc/stablediffusion/
+backend-assets/grpc/stablediffusion: backend-assets/grpc
+	if [ ! -f backend-assets/grpc/stablediffusion ]; then \
+	$(MAKE) go-stable-diffusion/libstablediffusion.a; \
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-stable-diffusion/ LIBRARY_PATH=$(shell pwd)/go-stable-diffusion/ \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./cmd/grpc/stablediffusion/; \
+	fi

 backend-assets/grpc/piper: backend-assets/grpc backend-assets/espeak-ng-data go-piper/libpiper_binding.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(shell pwd)/go-piper \
+	CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(shell pwd)/go-piper \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./cmd/grpc/piper/

 backend-assets/grpc/whisper: backend-assets/grpc whisper.cpp/libwhisper.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/whisper.cpp LIBRARY_PATH=$(shell pwd)/whisper.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./cmd/grpc/whisper/

-grpcs: prepare backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
+grpcs: prepare $(GRPC_BACKENDS)
README.md (26)
@@ -27,7 +27,7 @@

 [](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[](https://artifacthub.io/packages/search?repo=localai)

-**LocalAI** is a drop-in replacement REST API that's compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families that are compatible with the ggml format. Does not require GPU.
+**LocalAI** is a drop-in replacement REST API that's compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families that are compatible with the ggml format, pytorch and more. Does not require GPU.

 <p align="center"><b>Follow LocalAI </b></p>

@@ -104,15 +104,30 @@ Note that this started just as a [fun weekend project](https://localai.io/#backs

 Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.

-### 💡 Example: Use GPT4ALL-J model
+### 💡 Example: Use Luna-AI Llama model

-See the [documentation](https://localai.io/basics/getting_started/#example-use-gpt4all-j-model-with-docker-compose)
+See the [documentation](https://localai.io/basics/getting_started)

 ### 🔗 Resources

 - [How to build locally](https://localai.io/basics/build/index.html)
 - [How to install in Kubernetes](https://localai.io/basics/getting_started/index.html#run-localai-in-kubernetes)
 - [Projects integrating LocalAI](https://localai.io/integrations/)
+- [How tos section](https://localai.io/howtos/) (curated by our community)
+
+## Citation
+
+If you utilize this repository, data in a downstream project, please consider citing it with:
+
+```
+@misc{localai,
+  author = {Ettore Di Giacinto},
+  title = {LocalAI: The free, Open source OpenAI alternative},
+  year = {2023},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  howpublished = {\url{https://github.com/go-skynet/LocalAI}},
+```

 ## ❤️ Sponsors

@@ -127,6 +142,11 @@ A huge thank you to our generous sponsors who support this project:
 | [Spectro Cloud](https://www.spectrocloud.com/) |
 | Spectro Cloud kindly supports LocalAI by providing GPU and computing resources to run tests on lamdalabs! |

+And a huge shout-out to individuals sponsoring the project by donating hardware or backing the project.
+
+- [Sponsor list](https://github.com/sponsors/mudler)
+- JDAM00 (donating HW for the CI)
+
 ## 🌟 Star history

 [](https://star-history.com/#go-skynet/LocalAI&Date)
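Editor's note: the README above describes LocalAI as a drop-in, OpenAI-compatible REST API. Below is a minimal, hedged sketch of what a client request against the `/v1/chat/completions` route (registered in `api/api.go` further down) might look like. The host/port and the model name `luna-ai-llama2` are placeholders for illustration and are not taken from this changeset.

```go
// Hypothetical client sketch: POSTs an OpenAI-style chat request to a locally
// running LocalAI instance. Assumes the server listens on localhost:8080 and
// a model named "luna-ai-llama2" has been installed (both are assumptions).
package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	body := []byte(`{
		"model": "luna-ai-llama2",
		"messages": [{"role": "user", "content": "How are you?"}],
		"temperature": 0.9
	}`)

	// LocalAI exposes the same route shape as the OpenAI API, so an OpenAI
	// client pointed at this base URL should work the same way.
	resp, err := http.Post("http://localhost:8080/v1/chat/completions",
		"application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}
```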
170
api/api.go
170
api/api.go
@@ -2,13 +2,16 @@ package api
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"errors"
|
"errors"
|
||||||
|
"fmt"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
config "github.com/go-skynet/LocalAI/api/config"
|
config "github.com/go-skynet/LocalAI/api/config"
|
||||||
"github.com/go-skynet/LocalAI/api/localai"
|
"github.com/go-skynet/LocalAI/api/localai"
|
||||||
"github.com/go-skynet/LocalAI/api/openai"
|
"github.com/go-skynet/LocalAI/api/openai"
|
||||||
"github.com/go-skynet/LocalAI/api/options"
|
"github.com/go-skynet/LocalAI/api/options"
|
||||||
|
"github.com/go-skynet/LocalAI/api/schema"
|
||||||
"github.com/go-skynet/LocalAI/internal"
|
"github.com/go-skynet/LocalAI/internal"
|
||||||
|
"github.com/go-skynet/LocalAI/metrics"
|
||||||
"github.com/go-skynet/LocalAI/pkg/assets"
|
"github.com/go-skynet/LocalAI/pkg/assets"
|
||||||
|
|
||||||
"github.com/gofiber/fiber/v2"
|
"github.com/gofiber/fiber/v2"
|
||||||
@@ -19,7 +22,7 @@ import (
|
|||||||
"github.com/rs/zerolog/log"
|
"github.com/rs/zerolog/log"
|
||||||
)
|
)
|
||||||
|
|
||||||
func App(opts ...options.AppOption) (*fiber.App, error) {
|
func Startup(opts ...options.AppOption) (*options.Option, *config.ConfigLoader, error) {
|
||||||
options := options.NewOptions(opts...)
|
options := options.NewOptions(opts...)
|
||||||
|
|
||||||
zerolog.SetGlobalLevel(zerolog.InfoLevel)
|
zerolog.SetGlobalLevel(zerolog.InfoLevel)
|
||||||
@@ -27,6 +30,65 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
|
|||||||
zerolog.SetGlobalLevel(zerolog.DebugLevel)
|
zerolog.SetGlobalLevel(zerolog.DebugLevel)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
log.Info().Msgf("Starting LocalAI using %d threads, with models path: %s", options.Threads, options.Loader.ModelPath)
|
||||||
|
log.Info().Msgf("LocalAI version: %s", internal.PrintableVersion())
|
||||||
|
|
||||||
|
cl := config.NewConfigLoader()
|
||||||
|
if err := cl.LoadConfigs(options.Loader.ModelPath); err != nil {
|
||||||
|
log.Error().Msgf("error loading config files: %s", err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
if options.ConfigFile != "" {
|
||||||
|
if err := cl.LoadConfigFile(options.ConfigFile); err != nil {
|
||||||
|
log.Error().Msgf("error loading config file: %s", err.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if options.Debug {
|
||||||
|
for _, v := range cl.ListConfigs() {
|
||||||
|
cfg, _ := cl.GetConfig(v)
|
||||||
|
log.Debug().Msgf("Model: %s (config: %+v)", v, cfg)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if options.AssetsDestination != "" {
|
||||||
|
// Extract files from the embedded FS
|
||||||
|
err := assets.ExtractFiles(options.BackendAssets, options.AssetsDestination)
|
||||||
|
log.Debug().Msgf("Extracting backend assets files to %s", options.AssetsDestination)
|
||||||
|
if err != nil {
|
||||||
|
log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly, like gpt4all)", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if options.PreloadJSONModels != "" {
|
||||||
|
if err := localai.ApplyGalleryFromString(options.Loader.ModelPath, options.PreloadJSONModels, cl, options.Galleries); err != nil {
|
||||||
|
return nil, nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if options.PreloadModelsFromPath != "" {
|
||||||
|
if err := localai.ApplyGalleryFromFile(options.Loader.ModelPath, options.PreloadModelsFromPath, cl, options.Galleries); err != nil {
|
||||||
|
return nil, nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// turn off any process that was started by GRPC if the context is canceled
|
||||||
|
go func() {
|
||||||
|
<-options.Context.Done()
|
||||||
|
log.Debug().Msgf("Context canceled, shutting down")
|
||||||
|
options.Loader.StopAllGRPC()
|
||||||
|
}()
|
||||||
|
|
||||||
|
return options, cl, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func App(opts ...options.AppOption) (*fiber.App, error) {
|
||||||
|
|
||||||
|
options, cl, err := Startup(opts...)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed basic startup tasks with error %s", err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
// Return errors as JSON responses
|
// Return errors as JSON responses
|
||||||
app := fiber.New(fiber.Config{
|
app := fiber.New(fiber.Config{
|
||||||
BodyLimit: options.UploadLimitMB * 1024 * 1024, // this is the default limit of 4MB
|
BodyLimit: options.UploadLimitMB * 1024 * 1024, // this is the default limit of 4MB
|
||||||
@@ -44,8 +106,8 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
|
|||||||
|
|
||||||
// Send custom error page
|
// Send custom error page
|
||||||
return ctx.Status(code).JSON(
|
return ctx.Status(code).JSON(
|
||||||
openai.ErrorResponse{
|
schema.ErrorResponse{
|
||||||
Error: &openai.APIError{Message: err.Error(), Code: code},
|
Error: &schema.APIError{Message: err.Error(), Code: code},
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
},
|
},
|
||||||
@@ -57,38 +119,11 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
|
|||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Info().Msgf("Starting LocalAI using %d threads, with models path: %s", options.Threads, options.Loader.ModelPath)
|
|
||||||
log.Info().Msgf("LocalAI version: %s", internal.PrintableVersion())
|
|
||||||
|
|
||||||
cm := config.NewConfigLoader()
|
|
||||||
if err := cm.LoadConfigs(options.Loader.ModelPath); err != nil {
|
|
||||||
log.Error().Msgf("error loading config files: %s", err.Error())
|
|
||||||
}
|
|
||||||
|
|
||||||
if options.ConfigFile != "" {
|
|
||||||
if err := cm.LoadConfigFile(options.ConfigFile); err != nil {
|
|
||||||
log.Error().Msgf("error loading config file: %s", err.Error())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if options.Debug {
|
|
||||||
for _, v := range cm.ListConfigs() {
|
|
||||||
cfg, _ := cm.GetConfig(v)
|
|
||||||
log.Debug().Msgf("Model: %s (config: %+v)", v, cfg)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if options.AssetsDestination != "" {
|
|
||||||
// Extract files from the embedded FS
|
|
||||||
err := assets.ExtractFiles(options.BackendAssets, options.AssetsDestination)
|
|
||||||
log.Debug().Msgf("Extracting backend assets files to %s", options.AssetsDestination)
|
|
||||||
if err != nil {
|
|
||||||
log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly, like gpt4all)", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Default middleware config
|
// Default middleware config
|
||||||
app.Use(recover.New())
|
app.Use(recover.New())
|
||||||
|
if options.Metrics != nil {
|
||||||
|
app.Use(metrics.APIMiddleware(options.Metrics))
|
||||||
|
}
|
||||||
|
|
||||||
// Auth middleware checking if API key is valid. If no API key is set, no auth is required.
|
// Auth middleware checking if API key is valid. If no API key is set, no auth is required.
|
||||||
auth := func(c *fiber.Ctx) error {
|
auth := func(c *fiber.Ctx) error {
|
||||||
@@ -116,18 +151,6 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
|
|||||||
return c.Next()
|
return c.Next()
|
||||||
}
|
}
|
||||||
|
|
||||||
if options.PreloadJSONModels != "" {
|
|
||||||
if err := localai.ApplyGalleryFromString(options.Loader.ModelPath, options.PreloadJSONModels, cm, options.Galleries); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if options.PreloadModelsFromPath != "" {
|
|
||||||
if err := localai.ApplyGalleryFromFile(options.Loader.ModelPath, options.PreloadModelsFromPath, cm, options.Galleries); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if options.CORS {
|
if options.CORS {
|
||||||
var c func(ctx *fiber.Ctx) error
|
var c func(ctx *fiber.Ctx) error
|
||||||
if options.CORSAllowOrigins == "" {
|
if options.CORSAllowOrigins == "" {
|
||||||
@@ -141,7 +164,7 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
|
|||||||
|
|
||||||
// LocalAI API endpoints
|
// LocalAI API endpoints
|
||||||
galleryService := localai.NewGalleryService(options.Loader.ModelPath)
|
galleryService := localai.NewGalleryService(options.Loader.ModelPath)
|
||||||
galleryService.Start(options.Context, cm)
|
galleryService.Start(options.Context, cl)
|
||||||
|
|
||||||
app.Get("/version", auth, func(c *fiber.Ctx) error {
|
app.Get("/version", auth, func(c *fiber.Ctx) error {
|
||||||
return c.JSON(struct {
|
return c.JSON(struct {
|
||||||
@@ -149,36 +172,41 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
|
|||||||
}{Version: internal.PrintableVersion()})
|
}{Version: internal.PrintableVersion()})
|
||||||
})
|
})
|
||||||
|
|
||||||
app.Post("/models/apply", auth, localai.ApplyModelGalleryEndpoint(options.Loader.ModelPath, cm, galleryService.C, options.Galleries))
|
modelGalleryService := localai.CreateModelGalleryService(options.Galleries, options.Loader.ModelPath, galleryService)
|
||||||
app.Get("/models/available", auth, localai.ListModelFromGalleryEndpoint(options.Galleries, options.Loader.ModelPath))
|
app.Post("/models/apply", auth, modelGalleryService.ApplyModelGalleryEndpoint())
|
||||||
app.Get("/models/jobs/:uuid", auth, localai.GetOpStatusEndpoint(galleryService))
|
app.Get("/models/available", auth, modelGalleryService.ListModelFromGalleryEndpoint())
|
||||||
|
app.Get("/models/galleries", auth, modelGalleryService.ListModelGalleriesEndpoint())
|
||||||
|
app.Post("/models/galleries", auth, modelGalleryService.AddModelGalleryEndpoint())
|
||||||
|
app.Delete("/models/galleries", auth, modelGalleryService.RemoveModelGalleryEndpoint())
|
||||||
|
app.Get("/models/jobs/:uuid", auth, modelGalleryService.GetOpStatusEndpoint())
|
||||||
|
app.Get("/models/jobs", auth, modelGalleryService.GetAllStatusEndpoint())
|
||||||
|
|
||||||
// openAI compatible API endpoint
|
// openAI compatible API endpoint
|
||||||
|
|
||||||
// chat
|
// chat
|
||||||
app.Post("/v1/chat/completions", auth, openai.ChatEndpoint(cm, options))
|
app.Post("/v1/chat/completions", auth, openai.ChatEndpoint(cl, options))
|
||||||
app.Post("/chat/completions", auth, openai.ChatEndpoint(cm, options))
|
app.Post("/chat/completions", auth, openai.ChatEndpoint(cl, options))
|
||||||
|
|
||||||
// edit
|
// edit
|
||||||
app.Post("/v1/edits", auth, openai.EditEndpoint(cm, options))
|
app.Post("/v1/edits", auth, openai.EditEndpoint(cl, options))
|
||||||
app.Post("/edits", auth, openai.EditEndpoint(cm, options))
|
app.Post("/edits", auth, openai.EditEndpoint(cl, options))
|
||||||
|
|
||||||
// completion
|
// completion
|
||||||
app.Post("/v1/completions", auth, openai.CompletionEndpoint(cm, options))
|
app.Post("/v1/completions", auth, openai.CompletionEndpoint(cl, options))
|
||||||
app.Post("/completions", auth, openai.CompletionEndpoint(cm, options))
|
app.Post("/completions", auth, openai.CompletionEndpoint(cl, options))
|
||||||
app.Post("/v1/engines/:model/completions", auth, openai.CompletionEndpoint(cm, options))
|
app.Post("/v1/engines/:model/completions", auth, openai.CompletionEndpoint(cl, options))
|
||||||
|
|
||||||
// embeddings
|
// embeddings
|
||||||
app.Post("/v1/embeddings", auth, openai.EmbeddingsEndpoint(cm, options))
|
app.Post("/v1/embeddings", auth, openai.EmbeddingsEndpoint(cl, options))
|
||||||
app.Post("/embeddings", auth, openai.EmbeddingsEndpoint(cm, options))
|
app.Post("/embeddings", auth, openai.EmbeddingsEndpoint(cl, options))
|
||||||
app.Post("/v1/engines/:model/embeddings", auth, openai.EmbeddingsEndpoint(cm, options))
|
app.Post("/v1/engines/:model/embeddings", auth, openai.EmbeddingsEndpoint(cl, options))
|
||||||
|
|
||||||
// audio
|
// audio
|
||||||
app.Post("/v1/audio/transcriptions", auth, openai.TranscriptEndpoint(cm, options))
|
app.Post("/v1/audio/transcriptions", auth, openai.TranscriptEndpoint(cl, options))
|
||||||
app.Post("/tts", auth, localai.TTSEndpoint(cm, options))
|
app.Post("/tts", auth, localai.TTSEndpoint(cl, options))
|
||||||
|
|
||||||
// images
|
// images
|
||||||
app.Post("/v1/images/generations", auth, openai.ImageEndpoint(cm, options))
|
app.Post("/v1/images/generations", auth, openai.ImageEndpoint(cl, options))
|
||||||
|
|
||||||
if options.ImageDir != "" {
|
if options.ImageDir != "" {
|
||||||
app.Static("/generated-images", options.ImageDir)
|
app.Static("/generated-images", options.ImageDir)
|
||||||
@@ -196,16 +224,16 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
|
|||||||
app.Get("/healthz", ok)
|
app.Get("/healthz", ok)
|
||||||
app.Get("/readyz", ok)
|
app.Get("/readyz", ok)
|
||||||
|
|
||||||
// models
|
// Experimental Backend Statistics Module
|
||||||
app.Get("/v1/models", auth, openai.ListModelsEndpoint(options.Loader, cm))
|
backendMonitor := localai.NewBackendMonitor(cl, options) // Split out for now
|
||||||
app.Get("/models", auth, openai.ListModelsEndpoint(options.Loader, cm))
|
app.Get("/backend/monitor", localai.BackendMonitorEndpoint(backendMonitor))
|
||||||
|
app.Post("/backend/shutdown", localai.BackendShutdownEndpoint(backendMonitor))
|
||||||
|
|
||||||
// turn off any process that was started by GRPC if the context is canceled
|
// models
|
||||||
go func() {
|
app.Get("/v1/models", auth, openai.ListModelsEndpoint(options.Loader, cl))
|
||||||
<-options.Context.Done()
|
app.Get("/models", auth, openai.ListModelsEndpoint(options.Loader, cl))
|
||||||
log.Debug().Msgf("Context canceled, shutting down")
|
|
||||||
options.Loader.StopGRPC()
|
app.Get("/metrics", metrics.MetricsHandler())
|
||||||
}()
|
|
||||||
|
|
||||||
return app, nil
|
return app, nil
|
||||||
}
|
}
|
||||||
|
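Editor's note: the `api/api.go` changes above split startup work (`Startup`: config loading, backend-asset extraction, model preloading, gRPC shutdown watcher) from route wiring (`App`), and add `/metrics`, `/backend/monitor` and `/backend/shutdown` routes. The following is a hedged sketch of how a caller might start the server with these options; it is based only on the option helpers used in the tests that follow, and the models path and listen address are assumptions.

```go
// Hypothetical main package driving the refactored Startup/App pair.
package main

import (
	"context"

	api "github.com/go-skynet/LocalAI/api"
	"github.com/go-skynet/LocalAI/api/options"
	"github.com/go-skynet/LocalAI/metrics"
	model "github.com/go-skynet/LocalAI/pkg/model"
)

func main() {
	ctx := context.Background()

	// SetupMetrics wires the exporter served on the new /metrics route.
	metricsService, err := metrics.SetupMetrics()
	if err != nil {
		panic(err)
	}

	// App() now calls Startup() internally: configs are loaded, backend assets
	// extracted and preload galleries applied before any route is registered.
	app, err := api.App(
		options.WithContext(ctx),
		options.WithModelLoader(model.NewModelLoader("/models")), // placeholder path
		options.WithMetrics(metricsService),
	)
	if err != nil {
		panic(err)
	}

	if err := app.Listen("127.0.0.1:8080"); err != nil {
		panic(err)
	}
}
```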
api/api_test.go (101)
@@ -15,6 +15,7 @@ import (

 	. "github.com/go-skynet/LocalAI/api"
 	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/metrics"
 	"github.com/go-skynet/LocalAI/pkg/gallery"
 	"github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/go-skynet/LocalAI/pkg/utils"

@@ -162,8 +163,12 @@ var _ = Describe("API test", func() {
 			},
 		}

+		metricsService, err := metrics.SetupMetrics()
+		Expect(err).ToNot(HaveOccurred())
+
 		app, err = App(
 			append(commonOpts,
+				options.WithMetrics(metricsService),
 				options.WithContext(c),
 				options.WithGalleries(galleries),
 				options.WithModelLoader(modelLoader), options.WithBackendAssets(backendAssets), options.WithBackendAssetsOutput(tmpdir))...)

@@ -296,7 +301,7 @@ var _ = Describe("API test", func() {
 			response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
 				URL:       "github:go-skynet/model-gallery/openllama_3b.yaml",
 				Name:      "openllama_3b",
-				Overrides: map[string]interface{}{"backend": "llama", "mmap": true, "f16": true, "context_size": 128},
+				Overrides: map[string]interface{}{"backend": "llama-stable", "mmap": true, "f16": true, "context_size": 128},
 			})

 			Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))

@@ -359,6 +364,82 @@ var _ = Describe("API test", func() {
 			Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
 		})

+		It("runs openllama gguf", Label("llama-gguf"), func() {
+			if runtime.GOOS != "linux" {
+				Skip("test supported only on linux")
+			}
+			modelName := "codellama"
+			response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
+				URL:       "github:go-skynet/model-gallery/codellama-7b-instruct.yaml",
+				Name:      modelName,
+				Overrides: map[string]interface{}{"backend": "llama", "mmap": true, "f16": true, "context_size": 128},
+			})
+
+			Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
+
+			uuid := response["uuid"].(string)
+
+			Eventually(func() bool {
+				response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
+				return response["processed"].(bool)
+			}, "360s", "10s").Should(Equal(true))
+
+			By("testing chat")
+			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: modelName, Messages: []openai.ChatCompletionMessage{
+				{
+					Role:    "user",
+					Content: "How much is 2+2?",
+				},
+			}})
+			Expect(err).ToNot(HaveOccurred())
+			Expect(len(resp.Choices)).To(Equal(1))
+			Expect(resp.Choices[0].Message.Content).To(Or(ContainSubstring("4"), ContainSubstring("four")))
+
+			By("testing functions")
+			resp2, err := client.CreateChatCompletion(
+				context.TODO(),
+				openai.ChatCompletionRequest{
+					Model: modelName,
+					Messages: []openai.ChatCompletionMessage{
+						{
+							Role:    "user",
+							Content: "What is the weather like in San Francisco (celsius)?",
+						},
+					},
+					Functions: []openai.FunctionDefinition{
+						openai.FunctionDefinition{
+							Name:        "get_current_weather",
+							Description: "Get the current weather",
+							Parameters: jsonschema.Definition{
+								Type: jsonschema.Object,
+								Properties: map[string]jsonschema.Definition{
+									"location": {
+										Type:        jsonschema.String,
+										Description: "The city and state, e.g. San Francisco, CA",
+									},
+									"unit": {
+										Type: jsonschema.String,
+										Enum: []string{"celcius", "fahrenheit"},
+									},
+								},
+								Required: []string{"location"},
+							},
+						},
+					},
+				})
+			Expect(err).ToNot(HaveOccurred())
+			Expect(len(resp2.Choices)).To(Equal(1))
+			Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
+			Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
+
+			var res map[string]string
+			err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(res["location"]).To(Equal("San Francisco"), fmt.Sprint(res))
+			Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
+			Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
+		})
+
 		It("runs gpt4all", Label("gpt4all"), func() {
 			if runtime.GOOS != "linux" {
 				Skip("test supported only on linux")

@@ -376,7 +457,7 @@ var _ = Describe("API test", func() {
 			Eventually(func() bool {
 				response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
 				return response["processed"].(bool)
-			}, "360s", "10s").Should(Equal(true))
+			}, "960s", "10s").Should(Equal(true))

 			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "gpt4all-j", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: "How are you?"}}})
 			Expect(err).ToNot(HaveOccurred())

@@ -403,9 +484,13 @@ var _ = Describe("API test", func() {
 			},
 		}

+		metricsService, err := metrics.SetupMetrics()
+		Expect(err).ToNot(HaveOccurred())
+
 		app, err = App(
 			append(commonOpts,
 				options.WithContext(c),
+				options.WithMetrics(metricsService),
 				options.WithAudioDir(tmpdir),
 				options.WithImageDir(tmpdir),
 				options.WithGalleries(galleries),

@@ -507,12 +592,15 @@ var _ = Describe("API test", func() {
 		modelLoader = model.NewModelLoader(os.Getenv("MODELS_PATH"))
 		c, cancel = context.WithCancel(context.Background())

-		var err error
+		metricsService, err := metrics.SetupMetrics()
+		Expect(err).ToNot(HaveOccurred())
+
 		app, err = App(
 			append(commonOpts,
 				options.WithExternalBackend("huggingface", os.Getenv("HUGGINGFACE_GRPC")),
 				options.WithContext(c),
 				options.WithModelLoader(modelLoader),
+				options.WithMetrics(metricsService),
 			)...)
 		Expect(err).ToNot(HaveOccurred())
 		go app.Listen("127.0.0.1:9090")

@@ -599,7 +687,7 @@ var _ = Describe("API test", func() {
 			Input: []string{"sun", "cat"},
 		},
 	)
-	Expect(err).ToNot(HaveOccurred())
+	Expect(err).ToNot(HaveOccurred(), err)
 	Expect(len(resp.Data[0].Embedding)).To(BeNumerically("==", 384))
 	Expect(len(resp.Data[1].Embedding)).To(BeNumerically("==", 384))

@@ -716,10 +804,13 @@ var _ = Describe("API test", func() {
 	modelLoader = model.NewModelLoader(os.Getenv("MODELS_PATH"))
 	c, cancel = context.WithCancel(context.Background())

-	var err error
+	metricsService, err := metrics.SetupMetrics()
+	Expect(err).ToNot(HaveOccurred())
+
 	app, err = App(
 		append(commonOpts,
 			options.WithContext(c),
+			options.WithMetrics(metricsService),
 			options.WithModelLoader(modelLoader),
 			options.WithConfigFile(os.Getenv("CONFIG_FILE")))...,
 	)
|||||||
@@ -2,7 +2,6 @@ package backend
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"sync"
|
|
||||||
|
|
||||||
config "github.com/go-skynet/LocalAI/api/config"
|
config "github.com/go-skynet/LocalAI/api/config"
|
||||||
"github.com/go-skynet/LocalAI/api/options"
|
"github.com/go-skynet/LocalAI/api/options"
|
||||||
@@ -22,17 +21,13 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, c config.
|
|||||||
var inferenceModel interface{}
|
var inferenceModel interface{}
|
||||||
var err error
|
var err error
|
||||||
|
|
||||||
opts := []model.Option{
|
opts := modelOpts(c, o, []model.Option{
|
||||||
model.WithLoadGRPCLoadModelOpts(grpcOpts),
|
model.WithLoadGRPCLoadModelOpts(grpcOpts),
|
||||||
model.WithThreads(uint32(c.Threads)),
|
model.WithThreads(uint32(c.Threads)),
|
||||||
model.WithAssetDir(o.AssetsDestination),
|
model.WithAssetDir(o.AssetsDestination),
|
||||||
model.WithModel(modelFile),
|
model.WithModel(modelFile),
|
||||||
model.WithContext(o.Context),
|
model.WithContext(o.Context),
|
||||||
}
|
})
|
||||||
|
|
||||||
for k, v := range o.ExternalGRPCBackends {
|
|
||||||
opts = append(opts, model.WithExternalBackend(k, v))
|
|
||||||
}
|
|
||||||
|
|
||||||
if c.Backend == "" {
|
if c.Backend == "" {
|
||||||
inferenceModel, err = loader.GreedyLoader(opts...)
|
inferenceModel, err = loader.GreedyLoader(opts...)
|
||||||
@@ -80,18 +75,6 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, c config.
|
|||||||
}
|
}
|
||||||
|
|
||||||
return func() ([]float32, error) {
|
return func() ([]float32, error) {
|
||||||
// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
|
|
||||||
mutexMap.Lock()
|
|
||||||
l, ok := mutexes[modelFile]
|
|
||||||
if !ok {
|
|
||||||
m := &sync.Mutex{}
|
|
||||||
mutexes[modelFile] = m
|
|
||||||
l = m
|
|
||||||
}
|
|
||||||
mutexMap.Unlock()
|
|
||||||
l.Lock()
|
|
||||||
defer l.Unlock()
|
|
||||||
|
|
||||||
embeds, err := fn()
|
embeds, err := fn()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return embeds, err
|
return embeds, err
|
||||||
|
|||||||
@@ -1,17 +1,15 @@
 package backend
 
 import (
-    "sync"
-
     config "github.com/go-skynet/LocalAI/api/config"
     "github.com/go-skynet/LocalAI/api/options"
    "github.com/go-skynet/LocalAI/pkg/grpc/proto"
     model "github.com/go-skynet/LocalAI/pkg/model"
 )
 
-func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, dst string, loader *model.ModelLoader, c config.Config, o *options.Option) (func() error, error) {
+func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, c config.Config, o *options.Option) (func() error, error) {
 
-    opts := []model.Option{
+    opts := modelOpts(c, o, []model.Option{
         model.WithBackendString(c.Backend),
         model.WithAssetDir(o.AssetsDestination),
         model.WithThreads(uint32(c.Threads)),
@@ -21,12 +19,16 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
             CUDA:          c.Diffusers.CUDA,
             SchedulerType: c.Diffusers.SchedulerType,
             PipelineType:  c.Diffusers.PipelineType,
+            CFGScale:      c.Diffusers.CFGScale,
+            LoraAdapter:   c.LoraAdapter,
+            LoraScale:     c.LoraScale,
+            LoraBase:      c.LoraBase,
+            IMG2IMG:       c.Diffusers.IMG2IMG,
+            CLIPModel:     c.Diffusers.ClipModel,
+            CLIPSubfolder: c.Diffusers.ClipSubFolder,
+            CLIPSkip:      int32(c.Diffusers.ClipSkip),
         }),
-    }
+    })
 
-    for k, v := range o.ExternalGRPCBackends {
-        opts = append(opts, model.WithExternalBackend(k, v))
-    }
-
     inferenceModel, err := loader.BackendLoader(
         opts...,
@@ -39,31 +41,20 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
         _, err := inferenceModel.GenerateImage(
             o.Context,
             &proto.GenerateImageRequest{
                 Height:           int32(height),
                 Width:            int32(width),
                 Mode:             int32(mode),
                 Step:             int32(step),
                 Seed:             int32(seed),
+                CLIPSkip:         int32(c.Diffusers.ClipSkip),
                 PositivePrompt:   positive_prompt,
                 NegativePrompt:   negative_prompt,
                 Dst:              dst,
+                Src:              src,
+                EnableParameters: c.Diffusers.EnableParameters,
             })
         return err
     }
 
-    return func() error {
-        // This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
-        mutexMap.Lock()
-        l, ok := mutexes[c.Backend]
-        if !ok {
-            m := &sync.Mutex{}
-            mutexes[c.Backend] = m
-            l = m
-        }
-        mutexMap.Unlock()
-        l.Lock()
-        defer l.Unlock()
-
-        return fn()
-    }, nil
+    return fn, nil
 }
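The hunk above (and the same edit repeated in the other backends further down) replaces a hand-rolled option slice plus an ExternalGRPCBackends loop with a shared modelOpts helper that appends the cross-cutting loader options. The following standalone sketch of that functional-options composition is illustrative only; the option names and settings struct here are hypothetical, not the real pkg/model API.

package main

import "fmt"

// Option mirrors the functional-option style used by the model loader.
type Option func(*settings)

type settings struct {
    backend       string
    singleBackend bool
    grpcAttempts  int
}

func WithBackend(b string) Option     { return func(s *settings) { s.backend = b } }
func WithSingleActiveBackend() Option { return func(s *settings) { s.singleBackend = true } }
func WithGRPCAttempts(n int) Option   { return func(s *settings) { s.grpcAttempts = n } }

// sharedOpts plays the role of modelOpts: take the per-call options and
// append the options that every backend load should carry.
func sharedOpts(opts []Option) []Option {
    opts = append(opts, WithSingleActiveBackend())
    opts = append(opts, WithGRPCAttempts(3))
    return opts
}

func main() {
    opts := sharedOpts([]Option{WithBackend("diffusers")})

    s := settings{}
    for _, o := range opts {
        o(&s)
    }
    fmt.Printf("%+v\n", s) // {backend:diffusers singleBackend:true grpcAttempts:3}
}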
@@ -6,6 +6,7 @@ import (
     "regexp"
     "strings"
     "sync"
+    "unicode/utf8"
 
     config "github.com/go-skynet/LocalAI/api/config"
     "github.com/go-skynet/LocalAI/api/options"
@@ -15,7 +16,17 @@ import (
     "github.com/go-skynet/LocalAI/pkg/utils"
 )
 
-func ModelInference(ctx context.Context, s string, loader *model.ModelLoader, c config.Config, o *options.Option, tokenCallback func(string) bool) (func() (string, error), error) {
+type LLMResponse struct {
+    Response string // should this be []byte?
+    Usage    TokenUsage
+}
+
+type TokenUsage struct {
+    Prompt     int
+    Completion int
+}
+
+func ModelInference(ctx context.Context, s string, images []string, loader *model.ModelLoader, c config.Config, o *options.Option, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
     modelFile := c.Model
 
     grpcOpts := gRPCModelOpts(c)
@@ -23,17 +34,13 @@ func ModelInference(ctx context.Context, s string, loader *model.ModelLoader, c
     var inferenceModel *grpc.Client
     var err error
 
-    opts := []model.Option{
+    opts := modelOpts(c, o, []model.Option{
         model.WithLoadGRPCLoadModelOpts(grpcOpts),
         model.WithThreads(uint32(c.Threads)), // some models uses this to allocate threads during startup
         model.WithAssetDir(o.AssetsDestination),
         model.WithModel(modelFile),
         model.WithContext(o.Context),
-    }
-
-    for k, v := range o.ExternalGRPCBackends {
-        opts = append(opts, model.WithExternalBackend(k, v))
-    }
+    })
 
     if c.Backend != "" {
         opts = append(opts, model.WithBackendString(c.Backend))
@@ -62,40 +69,72 @@ func ModelInference(ctx context.Context, s string, loader *model.ModelLoader, c
     }
 
     // in GRPC, the backend is supposed to answer to 1 single token if stream is not supported
-    fn := func() (string, error) {
+    fn := func() (LLMResponse, error) {
         opts := gRPCPredictOpts(c, loader.ModelPath)
         opts.Prompt = s
+        opts.Images = images
+
+        tokenUsage := TokenUsage{}
+
+        // check the per-model feature flag for usage, since tokenCallback may have a cost.
+        // Defaults to off as for now it is still experimental
+        if c.FeatureFlag.Enabled("usage") {
+            userTokenCallback := tokenCallback
+            if userTokenCallback == nil {
+                userTokenCallback = func(token string, usage TokenUsage) bool {
+                    return true
+                }
+            }
+
+            promptInfo, pErr := inferenceModel.TokenizeString(ctx, opts)
+            if pErr == nil && promptInfo.Length > 0 {
+                tokenUsage.Prompt = int(promptInfo.Length)
+            }
+
+            tokenCallback = func(token string, usage TokenUsage) bool {
+                tokenUsage.Completion++
+                return userTokenCallback(token, tokenUsage)
+            }
+        }
+
         if tokenCallback != nil {
             ss := ""
-            err := inferenceModel.PredictStream(ctx, opts, func(s []byte) {
-                tokenCallback(string(s))
-                ss += string(s)
+
+            var partialRune []byte
+            err := inferenceModel.PredictStream(ctx, opts, func(chars []byte) {
+                partialRune = append(partialRune, chars...)
+
+                for len(partialRune) > 0 {
+                    r, size := utf8.DecodeRune(partialRune)
+                    if r == utf8.RuneError {
+                        // incomplete rune, wait for more bytes
+                        break
+                    }
+
+                    tokenCallback(string(r), tokenUsage)
+                    ss += string(r)
+
+                    partialRune = partialRune[size:]
+                }
             })
-            return ss, err
+            return LLMResponse{
+                Response: ss,
+                Usage:    tokenUsage,
+            }, err
         } else {
+            // TODO: Is the chicken bit the only way to get here? is that acceptable?
             reply, err := inferenceModel.Predict(ctx, opts)
             if err != nil {
-                return "", err
+                return LLMResponse{}, err
             }
-            return string(reply.Message), err
+            return LLMResponse{
+                Response: string(reply.Message),
+                Usage:    tokenUsage,
+            }, err
         }
     }
 
-    return func() (string, error) {
-        // This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
-        mutexMap.Lock()
-        l, ok := mutexes[modelFile]
-        if !ok {
-            m := &sync.Mutex{}
-            mutexes[modelFile] = m
-            l = m
-        }
-        mutexMap.Unlock()
-        l.Lock()
-        defer l.Unlock()
-
-        return fn()
-    }, nil
+    return fn, nil
 }
 
 var cutstrings map[string]*regexp.Regexp = make(map[string]*regexp.Regexp)
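The new PredictStream callback above buffers incoming bytes and only emits complete runes, because a multi-byte UTF-8 character can arrive split across two gRPC chunks. A minimal standalone sketch of that reassembly technique (the sample chunks are invented, the decoding logic mirrors the diff):

package main

import (
    "fmt"
    "unicode/utf8"
)

func main() {
    // "é" (0xC3 0xA9) arrives split across two streamed chunks.
    chunks := [][]byte{{'h', 'i', ' ', 0xC3}, {0xA9, '!'}}

    var partialRune []byte
    out := ""
    for _, chunk := range chunks {
        partialRune = append(partialRune, chunk...)
        for len(partialRune) > 0 {
            r, size := utf8.DecodeRune(partialRune)
            if r == utf8.RuneError {
                // incomplete rune: keep the bytes and wait for the next chunk
                break
            }
            out += string(r)
            partialRune = partialRune[size:]
        }
    }
    fmt.Println(out) // hi é!
}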
@@ -1,22 +0,0 @@
-package backend
-
-import "sync"
-
-// mutex still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
-var mutexMap sync.Mutex
-var mutexes map[string]*sync.Mutex = make(map[string]*sync.Mutex)
-
-func Lock(s string) *sync.Mutex {
-    // This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
-    mutexMap.Lock()
-    l, ok := mutexes[s]
-    if !ok {
-        m := &sync.Mutex{}
-        mutexes[s] = m
-        l = m
-    }
-    mutexMap.Unlock()
-    l.Lock()
-
-    return l
-}

@@ -5,39 +5,75 @@ import (
     "path/filepath"
 
     pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+    model "github.com/go-skynet/LocalAI/pkg/model"
 
     config "github.com/go-skynet/LocalAI/api/config"
+    "github.com/go-skynet/LocalAI/api/options"
 )
 
+func modelOpts(c config.Config, o *options.Option, opts []model.Option) []model.Option {
+    if o.SingleBackend {
+        opts = append(opts, model.WithSingleActiveBackend())
+    }
+
+    if c.GRPC.Attempts != 0 {
+        opts = append(opts, model.WithGRPCAttempts(c.GRPC.Attempts))
+    }
+
+    if c.GRPC.AttemptsSleepTime != 0 {
+        opts = append(opts, model.WithGRPCAttemptsDelay(c.GRPC.AttemptsSleepTime))
+    }
+
+    for k, v := range o.ExternalGRPCBackends {
+        opts = append(opts, model.WithExternalBackend(k, v))
+    }
+
+    return opts
+}
+
 func gRPCModelOpts(c config.Config) *pb.ModelOptions {
     b := 512
     if c.Batch != 0 {
         b = c.Batch
     }
-    return &pb.ModelOptions{
-        ContextSize:   int32(c.ContextSize),
-        Seed:          int32(c.Seed),
-        NBatch:        int32(b),
-        NGQA:          c.NGQA,
-        RMSNormEps:    c.RMSNormEps,
-        F16Memory:     c.F16,
-        MLock:         c.MMlock,
-        RopeFreqBase:  c.RopeFreqBase,
-        RopeFreqScale: c.RopeFreqScale,
-        NUMA:          c.NUMA,
-        Embeddings:    c.Embeddings,
-        LowVRAM:       c.LowVRAM,
-        NGPULayers:    int32(c.NGPULayers),
-        MMap:          c.MMap,
-        MainGPU:       c.MainGPU,
-        Threads:       int32(c.Threads),
-        TensorSplit:   c.TensorSplit,
+    return &pb.ModelOptions{
+        ContextSize:    int32(c.ContextSize),
+        Seed:           int32(c.Seed),
+        NBatch:         int32(b),
+        NoMulMatQ:      c.NoMulMatQ,
+        DraftModel:     c.DraftModel,
+        AudioPath:      c.VallE.AudioPath,
+        Quantization:   c.Quantization,
+        MMProj:         c.MMProj,
+        YarnExtFactor:  c.YarnExtFactor,
+        YarnAttnFactor: c.YarnAttnFactor,
+        YarnBetaFast:   c.YarnBetaFast,
+        YarnBetaSlow:   c.YarnBetaSlow,
+        LoraAdapter:    c.LoraAdapter,
+        LoraBase:       c.LoraBase,
+        LoraScale:      c.LoraScale,
+        NGQA:           c.NGQA,
+        RMSNormEps:     c.RMSNormEps,
+        F16Memory:      c.F16,
+        MLock:          c.MMlock,
+        RopeFreqBase:   c.RopeFreqBase,
+        RopeFreqScale:  c.RopeFreqScale,
+        NUMA:           c.NUMA,
+        Embeddings:     c.Embeddings,
+        LowVRAM:        c.LowVRAM,
+        NGPULayers:     int32(c.NGPULayers),
+        MMap:           c.MMap,
+        MainGPU:        c.MainGPU,
+        Threads:        int32(c.Threads),
+        TensorSplit:    c.TensorSplit,
         // AutoGPTQ
         ModelBaseName:    c.AutoGPTQ.ModelBaseName,
         Device:           c.AutoGPTQ.Device,
         UseTriton:        c.AutoGPTQ.Triton,
         UseFastTokenizer: c.AutoGPTQ.UseFastTokenizer,
+        // RWKV
+        Tokenizer: c.Tokenizer,
     }
 }
@@ -51,6 +87,7 @@ func gRPCPredictOpts(c config.Config, modelPath string) *pb.PredictOptions {
     return &pb.PredictOptions{
         Temperature: float32(c.Temperature),
         TopP:        float32(c.TopP),
+        NDraft:      c.NDraft,
         TopK:        int32(c.TopK),
         Tokens:      int32(c.Maxtokens),
         Threads:     int32(c.Threads),

@@ -5,25 +5,22 @@ import (
     "fmt"
 
     config "github.com/go-skynet/LocalAI/api/config"
+    "github.com/go-skynet/LocalAI/api/schema"
 
     "github.com/go-skynet/LocalAI/api/options"
     "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-    "github.com/go-skynet/LocalAI/pkg/grpc/whisper/api"
     model "github.com/go-skynet/LocalAI/pkg/model"
 )
 
-func ModelTranscription(audio, language string, loader *model.ModelLoader, c config.Config, o *options.Option) (*api.Result, error) {
-    opts := []model.Option{
+func ModelTranscription(audio, language string, loader *model.ModelLoader, c config.Config, o *options.Option) (*schema.Result, error) {
+
+    opts := modelOpts(c, o, []model.Option{
         model.WithBackendString(model.WhisperBackend),
         model.WithModel(c.Model),
         model.WithContext(o.Context),
         model.WithThreads(uint32(c.Threads)),
         model.WithAssetDir(o.AssetsDestination),
-    }
-
-    for k, v := range o.ExternalGRPCBackends {
-        opts = append(opts, model.WithExternalBackend(k, v))
-    }
+    })
 
     whisperModel, err := o.Loader.BackendLoader(opts...)
     if err != nil {

@@ -6,6 +6,7 @@ import (
     "os"
     "path/filepath"
 
+    api_config "github.com/go-skynet/LocalAI/api/config"
     "github.com/go-skynet/LocalAI/api/options"
     "github.com/go-skynet/LocalAI/pkg/grpc/proto"
     model "github.com/go-skynet/LocalAI/pkg/model"
@@ -33,17 +34,12 @@ func ModelTTS(backend, text, modelFile string, loader *model.ModelLoader, o *opt
     if bb == "" {
         bb = model.PiperBackend
     }
-    opts := []model.Option{
+    opts := modelOpts(api_config.Config{}, o, []model.Option{
         model.WithBackendString(bb),
         model.WithModel(modelFile),
         model.WithContext(o.Context),
         model.WithAssetDir(o.AssetsDestination),
-    }
-
-    for k, v := range o.ExternalGRPCBackends {
-        opts = append(opts, model.WithExternalBackend(k, v))
-    }
+    })
 
     piperModel, err := o.Loader.BackendLoader(opts...)
     if err != nil {
         return "", nil, err

@@ -29,6 +29,7 @@ type Config struct {
 
     FunctionsConfig Functions `yaml:"function"`
 
+    FeatureFlag FeatureFlag `yaml:"feature_flags"` // Feature Flag registry. We move fast, and features may break on a per model/backend basis. Registry for (usually temporary) flags that indicate aborting something early.
     // LLM configs (GPT4ALL, Llama.cpp, ...)
     LLMConfig `yaml:",inline"`
 
@@ -39,12 +40,40 @@ type Config struct {
     Diffusers Diffusers `yaml:"diffusers"`
     Step      int       `yaml:"step"`
 
+    // GRPC Options
+    GRPC GRPC `yaml:"grpc"`
+
+    // Vall-e-x
+    VallE VallE `yaml:"vall-e"`
+}
+
+type VallE struct {
+    AudioPath string `yaml:"audio_path"`
+}
+
+type FeatureFlag map[string]*bool
+
+func (ff FeatureFlag) Enabled(s string) bool {
+    v, exist := ff[s]
+    return exist && v != nil && *v
+}
+
+type GRPC struct {
+    Attempts          int `yaml:"attempts"`
+    AttemptsSleepTime int `yaml:"attempts_sleep_time"`
 }
 
 type Diffusers struct {
     PipelineType  string `yaml:"pipeline_type"`
     SchedulerType string `yaml:"scheduler_type"`
     CUDA          bool   `yaml:"cuda"`
+    EnableParameters string  `yaml:"enable_parameters"` // A list of comma separated parameters to specify
+    CFGScale         float32 `yaml:"cfg_scale"`         // Classifier-Free Guidance Scale
+    IMG2IMG          bool    `yaml:"img2img"`           // Image to Image Diffuser
+    ClipSkip         int     `yaml:"clip_skip"`         // Skip every N frames
+    ClipModel        string  `yaml:"clip_model"`        // Clip model to use
+    ClipSubFolder    string  `yaml:"clip_subfolder"`    // Subfolder to use for clip model
 }
 
 type LLMConfig struct {
@@ -69,6 +98,20 @@ type LLMConfig struct {
     TrimSpace   []string `yaml:"trimspace"`
     ContextSize int      `yaml:"context_size"`
     NUMA        bool     `yaml:"numa"`
+    LoraAdapter  string  `yaml:"lora_adapter"`
+    LoraBase     string  `yaml:"lora_base"`
+    LoraScale    float32 `yaml:"lora_scale"`
+    NoMulMatQ    bool    `yaml:"no_mulmatq"`
+    DraftModel   string  `yaml:"draft_model"`
+    NDraft       int32   `yaml:"n_draft"`
+    Quantization string  `yaml:"quantization"`
+    MMProj       string  `yaml:"mmproj"`
+
+    RopeScaling    string  `yaml:"rope_scaling"`
+    YarnExtFactor  float32 `yaml:"yarn_ext_factor"`
+    YarnAttnFactor float32 `yaml:"yarn_attn_factor"`
+    YarnBetaFast   float32 `yaml:"yarn_beta_fast"`
+    YarnBetaSlow   float32 `yaml:"yarn_beta_slow"`
 }
 
 type AutoGPTQ struct {

@@ -41,4 +41,10 @@ type PredictionOptions struct {
     NegativePromptScale float32 `json:"negative_prompt_scale" yaml:"negative_prompt_scale"`
     // AutoGPTQ
     UseFastTokenizer bool `json:"use_fast_tokenizer" yaml:"use_fast_tokenizer"`
+
+    // Diffusers
+    ClipSkip int `json:"clip_skip" yaml:"clip_skip"`
+
+    // RWKV (?)
+    Tokenizer string `json:"tokenizer" yaml:"tokenizer"`
 }

api/localai/backend_monitor.go (new file, 163 lines)
@@ -0,0 +1,163 @@
+package localai
+
+import (
+    "context"
+    "fmt"
+    "strings"
+
+    config "github.com/go-skynet/LocalAI/api/config"
+    "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+
+    "github.com/go-skynet/LocalAI/api/options"
+    "github.com/gofiber/fiber/v2"
+    "github.com/rs/zerolog/log"
+
+    gopsutil "github.com/shirou/gopsutil/v3/process"
+)
+
+type BackendMonitorRequest struct {
+    Model string `json:"model" yaml:"model"`
+}
+
+type BackendMonitorResponse struct {
+    MemoryInfo    *gopsutil.MemoryInfoStat
+    MemoryPercent float32
+    CPUPercent    float64
+}
+
+type BackendMonitor struct {
+    configLoader *config.ConfigLoader
+    options      *options.Option // Taking options in case we need to inspect ExternalGRPCBackends, though that's out of scope for now, hence the name.
+}
+
+func NewBackendMonitor(configLoader *config.ConfigLoader, options *options.Option) BackendMonitor {
+    return BackendMonitor{
+        configLoader: configLoader,
+        options:      options,
+    }
+}
+
+func (bm *BackendMonitor) SampleLocalBackendProcess(model string) (*BackendMonitorResponse, error) {
+    config, exists := bm.configLoader.GetConfig(model)
+    var backend string
+    if exists {
+        backend = config.Model
+    } else {
+        // Last ditch effort: use it raw, see if a backend happens to match.
+        backend = model
+    }
+
+    if !strings.HasSuffix(backend, ".bin") {
+        backend = fmt.Sprintf("%s.bin", backend)
+    }
+
+    pid, err := bm.options.Loader.GetGRPCPID(backend)
+
+    if err != nil {
+        log.Error().Msgf("model %s : failed to find pid %+v", model, err)
+        return nil, err
+    }
+
+    // Name is slightly frightening but this does _not_ create a new process, rather it looks up an existing process by PID.
+    backendProcess, err := gopsutil.NewProcess(int32(pid))
+
+    if err != nil {
+        log.Error().Msgf("model %s [PID %d] : error getting process info %+v", model, pid, err)
+        return nil, err
+    }
+
+    memInfo, err := backendProcess.MemoryInfo()
+
+    if err != nil {
+        log.Error().Msgf("model %s [PID %d] : error getting memory info %+v", model, pid, err)
+        return nil, err
+    }
+
+    memPercent, err := backendProcess.MemoryPercent()
+    if err != nil {
+        log.Error().Msgf("model %s [PID %d] : error getting memory percent %+v", model, pid, err)
+        return nil, err
+    }
+
+    cpuPercent, err := backendProcess.CPUPercent()
+    if err != nil {
+        log.Error().Msgf("model %s [PID %d] : error getting cpu percent %+v", model, pid, err)
+        return nil, err
+    }
+
+    return &BackendMonitorResponse{
+        MemoryInfo:    memInfo,
+        MemoryPercent: memPercent,
+        CPUPercent:    cpuPercent,
+    }, nil
+}
+
+func (bm BackendMonitor) getModelLoaderIDFromCtx(c *fiber.Ctx) (string, error) {
+    input := new(BackendMonitorRequest)
+    // Get input data from the request body
+    if err := c.BodyParser(input); err != nil {
+        return "", err
+    }
+
+    config, exists := bm.configLoader.GetConfig(input.Model)
+    var backendId string
+    if exists {
+        backendId = config.Model
+    } else {
+        // Last ditch effort: use it raw, see if a backend happens to match.
+        backendId = input.Model
+    }
+
+    if !strings.HasSuffix(backendId, ".bin") {
+        backendId = fmt.Sprintf("%s.bin", backendId)
+    }
+
+    return backendId, nil
+}
+
+func BackendMonitorEndpoint(bm BackendMonitor) func(c *fiber.Ctx) error {
+    return func(c *fiber.Ctx) error {
+
+        backendId, err := bm.getModelLoaderIDFromCtx(c)
+        if err != nil {
+            return err
+        }
+
+        client := bm.options.Loader.CheckIsLoaded(backendId)
+
+        if client == nil {
+            return fmt.Errorf("backend %s is not currently loaded", backendId)
+        }
+
+        status, rpcErr := client.Status(context.TODO())
+        if rpcErr != nil {
+            log.Warn().Msgf("backend %s experienced an error retrieving status info: %s", backendId, rpcErr.Error())
+            val, slbErr := bm.SampleLocalBackendProcess(backendId)
+            if slbErr != nil {
+                return fmt.Errorf("backend %s experienced an error retrieving status info via rpc: %s, then failed local node process sample: %s", backendId, rpcErr.Error(), slbErr.Error())
+            }
+            return c.JSON(proto.StatusResponse{
+                State: proto.StatusResponse_ERROR,
+                Memory: &proto.MemoryUsageData{
+                    Total: val.MemoryInfo.VMS,
+                    Breakdown: map[string]uint64{
+                        "gopsutil-RSS": val.MemoryInfo.RSS,
+                    },
+                },
+            })
+        }
+
+        return c.JSON(status)
+    }
+}
+
+func BackendShutdownEndpoint(bm BackendMonitor) func(c *fiber.Ctx) error {
+    return func(c *fiber.Ctx) error {
+        backendId, err := bm.getModelLoaderIDFromCtx(c)
+        if err != nil {
+            return err
+        }
+
+        return bm.options.Loader.ShutdownModel(backendId)
+    }
+}
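The new backend monitor falls back to sampling the backend's local process with gopsutil when the gRPC status call fails. A self-contained sketch of those gopsutil calls follows; as an assumption for the example it samples the current process instead of a resolved backend PID.

package main

import (
    "fmt"
    "os"

    gopsutil "github.com/shirou/gopsutil/v3/process"
)

func main() {
    // The monitor looks up the gRPC backend's PID via the model loader; here we just use our own PID.
    p, err := gopsutil.NewProcess(int32(os.Getpid()))
    if err != nil {
        panic(err)
    }

    memInfo, _ := p.MemoryInfo() // RSS / VMS, as used for the Memory breakdown in the error response
    memPercent, _ := p.MemoryPercent()
    cpuPercent, _ := p.CPUPercent()

    fmt.Printf("rss=%d vms=%d mem%%=%.2f cpu%%=%.2f\n",
        memInfo.RSS, memInfo.VMS, memPercent, cpuPercent)
}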
@@ -4,6 +4,7 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
|
"slices"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
@@ -27,6 +28,7 @@ type galleryOp struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type galleryOpStatus struct {
|
type galleryOpStatus struct {
|
||||||
|
FileName string `json:"file_name"`
|
||||||
Error error `json:"error"`
|
Error error `json:"error"`
|
||||||
Processed bool `json:"processed"`
|
Processed bool `json:"processed"`
|
||||||
Message string `json:"message"`
|
Message string `json:"message"`
|
||||||
@@ -50,7 +52,6 @@ func NewGalleryService(modelPath string) *galleryApplier {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// prepareModel applies a
|
|
||||||
func prepareModel(modelPath string, req gallery.GalleryModel, cm *config.ConfigLoader, downloadStatus func(string, string, string, float64)) error {
|
func prepareModel(modelPath string, req gallery.GalleryModel, cm *config.ConfigLoader, downloadStatus func(string, string, string, float64)) error {
|
||||||
|
|
||||||
config, err := gallery.GetGalleryConfigFromURL(req.URL)
|
config, err := gallery.GetGalleryConfigFromURL(req.URL)
|
||||||
@@ -76,6 +77,13 @@ func (g *galleryApplier) getStatus(s string) *galleryOpStatus {
|
|||||||
return g.statuses[s]
|
return g.statuses[s]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (g *galleryApplier) getAllStatus() map[string]*galleryOpStatus {
|
||||||
|
g.Lock()
|
||||||
|
defer g.Unlock()
|
||||||
|
|
||||||
|
return g.statuses
|
||||||
|
}
|
||||||
|
|
||||||
func (g *galleryApplier) Start(c context.Context, cm *config.ConfigLoader) {
|
func (g *galleryApplier) Start(c context.Context, cm *config.ConfigLoader) {
|
||||||
go func() {
|
go func() {
|
||||||
for {
|
for {
|
||||||
@@ -94,7 +102,7 @@ func (g *galleryApplier) Start(c context.Context, cm *config.ConfigLoader) {
|
|||||||
|
|
||||||
// displayDownload displays the download progress
|
// displayDownload displays the download progress
|
||||||
progressCallback := func(fileName string, current string, total string, percentage float64) {
|
progressCallback := func(fileName string, current string, total string, percentage float64) {
|
||||||
g.updateStatus(op.id, &galleryOpStatus{Message: "processing", Progress: percentage, TotalFileSize: total, DownloadedFileSize: current})
|
g.updateStatus(op.id, &galleryOpStatus{Message: "processing", FileName: fileName, Progress: percentage, TotalFileSize: total, DownloadedFileSize: current})
|
||||||
utils.DisplayDownloadFunction(fileName, current, total, percentage)
|
utils.DisplayDownloadFunction(fileName, current, total, percentage)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -176,18 +184,12 @@ func ApplyGalleryFromString(modelPath, s string, cm *config.ConfigLoader, galler
|
|||||||
return processRequests(modelPath, s, cm, galleries, requests)
|
return processRequests(modelPath, s, cm, galleries, requests)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Endpoints
|
/// Endpoint Service
|
||||||
|
|
||||||
func GetOpStatusEndpoint(g *galleryApplier) func(c *fiber.Ctx) error {
|
type ModelGalleryService struct {
|
||||||
return func(c *fiber.Ctx) error {
|
galleries []gallery.Gallery
|
||||||
|
modelPath string
|
||||||
status := g.getStatus(c.Params("uuid"))
|
galleryApplier *galleryApplier
|
||||||
if status == nil {
|
|
||||||
return fmt.Errorf("could not find any status for ID")
|
|
||||||
}
|
|
||||||
|
|
||||||
return c.JSON(status)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type GalleryModel struct {
|
type GalleryModel struct {
|
||||||
@@ -195,7 +197,31 @@ type GalleryModel struct {
|
|||||||
gallery.GalleryModel
|
gallery.GalleryModel
|
||||||
}
|
}
|
||||||
|
|
||||||
func ApplyModelGalleryEndpoint(modelPath string, cm *config.ConfigLoader, g chan galleryOp, galleries []gallery.Gallery) func(c *fiber.Ctx) error {
|
func CreateModelGalleryService(galleries []gallery.Gallery, modelPath string, galleryApplier *galleryApplier) ModelGalleryService {
|
||||||
|
return ModelGalleryService{
|
||||||
|
galleries: galleries,
|
||||||
|
modelPath: modelPath,
|
||||||
|
galleryApplier: galleryApplier,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (mgs *ModelGalleryService) GetOpStatusEndpoint() func(c *fiber.Ctx) error {
|
||||||
|
return func(c *fiber.Ctx) error {
|
||||||
|
status := mgs.galleryApplier.getStatus(c.Params("uuid"))
|
||||||
|
if status == nil {
|
||||||
|
return fmt.Errorf("could not find any status for ID")
|
||||||
|
}
|
||||||
|
return c.JSON(status)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (mgs *ModelGalleryService) GetAllStatusEndpoint() func(c *fiber.Ctx) error {
|
||||||
|
return func(c *fiber.Ctx) error {
|
||||||
|
return c.JSON(mgs.galleryApplier.getAllStatus())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (mgs *ModelGalleryService) ApplyModelGalleryEndpoint() func(c *fiber.Ctx) error {
|
||||||
return func(c *fiber.Ctx) error {
|
return func(c *fiber.Ctx) error {
|
||||||
input := new(GalleryModel)
|
input := new(GalleryModel)
|
||||||
// Get input data from the request body
|
// Get input data from the request body
|
||||||
@@ -207,11 +233,11 @@ func ApplyModelGalleryEndpoint(modelPath string, cm *config.ConfigLoader, g chan
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
g <- galleryOp{
|
mgs.galleryApplier.C <- galleryOp{
|
||||||
req: input.GalleryModel,
|
req: input.GalleryModel,
|
||||||
id: uuid.String(),
|
id: uuid.String(),
|
||||||
galleryName: input.ID,
|
galleryName: input.ID,
|
||||||
galleries: galleries,
|
galleries: mgs.galleries,
|
||||||
}
|
}
|
||||||
return c.JSON(struct {
|
return c.JSON(struct {
|
||||||
ID string `json:"uuid"`
|
ID string `json:"uuid"`
|
||||||
@@ -220,11 +246,11 @@ func ApplyModelGalleryEndpoint(modelPath string, cm *config.ConfigLoader, g chan
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func ListModelFromGalleryEndpoint(galleries []gallery.Gallery, basePath string) func(c *fiber.Ctx) error {
|
func (mgs *ModelGalleryService) ListModelFromGalleryEndpoint() func(c *fiber.Ctx) error {
|
||||||
return func(c *fiber.Ctx) error {
|
return func(c *fiber.Ctx) error {
|
||||||
log.Debug().Msgf("Listing models from galleries: %+v", galleries)
|
log.Debug().Msgf("Listing models from galleries: %+v", mgs.galleries)
|
||||||
|
|
||||||
models, err := gallery.AvailableGalleryModels(galleries, basePath)
|
models, err := gallery.AvailableGalleryModels(mgs.galleries, mgs.modelPath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@@ -239,3 +265,56 @@ func ListModelFromGalleryEndpoint(galleries []gallery.Gallery, basePath string)
|
|||||||
return c.Send(dat)
|
return c.Send(dat)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NOTE: This is different (and much simpler!) than above! This JUST lists the model galleries that have been loaded, not their contents!
|
||||||
|
func (mgs *ModelGalleryService) ListModelGalleriesEndpoint() func(c *fiber.Ctx) error {
|
||||||
|
return func(c *fiber.Ctx) error {
|
||||||
|
log.Debug().Msgf("Listing model galleries %+v", mgs.galleries)
|
||||||
|
dat, err := json.Marshal(mgs.galleries)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return c.Send(dat)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (mgs *ModelGalleryService) AddModelGalleryEndpoint() func(c *fiber.Ctx) error {
|
||||||
|
return func(c *fiber.Ctx) error {
|
||||||
|
input := new(gallery.Gallery)
|
||||||
|
// Get input data from the request body
|
||||||
|
if err := c.BodyParser(input); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if slices.ContainsFunc(mgs.galleries, func(gallery gallery.Gallery) bool {
|
||||||
|
return gallery.Name == input.Name
|
||||||
|
}) {
|
||||||
|
return fmt.Errorf("%s already exists", input.Name)
|
||||||
|
}
|
||||||
|
dat, err := json.Marshal(mgs.galleries)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
log.Debug().Msgf("Adding %+v to gallery list", *input)
|
||||||
|
mgs.galleries = append(mgs.galleries, *input)
|
||||||
|
return c.Send(dat)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (mgs *ModelGalleryService) RemoveModelGalleryEndpoint() func(c *fiber.Ctx) error {
|
||||||
|
return func(c *fiber.Ctx) error {
|
||||||
|
input := new(gallery.Gallery)
|
||||||
|
// Get input data from the request body
|
||||||
|
if err := c.BodyParser(input); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if !slices.ContainsFunc(mgs.galleries, func(gallery gallery.Gallery) bool {
|
||||||
|
return gallery.Name == input.Name
|
||||||
|
}) {
|
||||||
|
return fmt.Errorf("%s is not currently registered", input.Name)
|
||||||
|
}
|
||||||
|
mgs.galleries = slices.DeleteFunc(mgs.galleries, func(gallery gallery.Gallery) bool {
|
||||||
|
return gallery.Name == input.Name
|
||||||
|
})
|
||||||
|
return c.Send(nil)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -6,34 +6,48 @@ import (
|
|||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"strings"
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/go-skynet/LocalAI/api/backend"
|
"github.com/go-skynet/LocalAI/api/backend"
|
||||||
config "github.com/go-skynet/LocalAI/api/config"
|
config "github.com/go-skynet/LocalAI/api/config"
|
||||||
"github.com/go-skynet/LocalAI/api/options"
|
"github.com/go-skynet/LocalAI/api/options"
|
||||||
|
"github.com/go-skynet/LocalAI/api/schema"
|
||||||
"github.com/go-skynet/LocalAI/pkg/grammar"
|
"github.com/go-skynet/LocalAI/pkg/grammar"
|
||||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||||
"github.com/go-skynet/LocalAI/pkg/utils"
|
"github.com/go-skynet/LocalAI/pkg/utils"
|
||||||
"github.com/gofiber/fiber/v2"
|
"github.com/gofiber/fiber/v2"
|
||||||
|
"github.com/google/uuid"
|
||||||
"github.com/rs/zerolog/log"
|
"github.com/rs/zerolog/log"
|
||||||
"github.com/valyala/fasthttp"
|
"github.com/valyala/fasthttp"
|
||||||
)
|
)
|
||||||
|
|
||||||
func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
|
func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
|
||||||
emptyMessage := ""
|
emptyMessage := ""
|
||||||
|
id := uuid.New().String()
|
||||||
|
created := int(time.Now().Unix())
|
||||||
|
|
||||||
process := func(s string, req *OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan OpenAIResponse) {
|
process := func(s string, req *schema.OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
|
||||||
initialMessage := OpenAIResponse{
|
initialMessage := schema.OpenAIResponse{
|
||||||
|
ID: id,
|
||||||
|
Created: created,
|
||||||
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
|
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||||
Choices: []Choice{{Delta: &Message{Role: "assistant", Content: &emptyMessage}}},
|
Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant", Content: &emptyMessage}}},
|
||||||
Object: "chat.completion.chunk",
|
Object: "chat.completion.chunk",
|
||||||
}
|
}
|
||||||
responses <- initialMessage
|
responses <- initialMessage
|
||||||
|
|
||||||
ComputeChoices(req, s, config, o, loader, func(s string, c *[]Choice) {}, func(s string) bool {
|
ComputeChoices(req, s, config, o, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
|
||||||
resp := OpenAIResponse{
|
resp := schema.OpenAIResponse{
|
||||||
|
ID: id,
|
||||||
|
Created: created,
|
||||||
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
|
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||||
Choices: []Choice{{Delta: &Message{Content: &s}, Index: 0}},
|
Choices: []schema.Choice{{Delta: &schema.Message{Content: &s}, Index: 0}},
|
||||||
Object: "chat.completion.chunk",
|
Object: "chat.completion.chunk",
|
||||||
|
Usage: schema.OpenAIUsage{
|
||||||
|
PromptTokens: usage.Prompt,
|
||||||
|
CompletionTokens: usage.Completion,
|
||||||
|
TotalTokens: usage.Prompt + usage.Completion,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
responses <- resp
|
responses <- resp
|
||||||
@@ -67,6 +81,10 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
|
|||||||
noActionDescription = config.FunctionsConfig.NoActionDescriptionName
|
noActionDescription = config.FunctionsConfig.NoActionDescriptionName
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if input.ResponseFormat == "json_object" {
|
||||||
|
input.Grammar = grammar.JSONBNF
|
||||||
|
}
|
||||||
|
|
||||||
// process functions if we have any defined or if we have a function call string
|
// process functions if we have any defined or if we have a function call string
|
||||||
if len(input.Functions) > 0 && config.ShouldUseFunctions() {
|
if len(input.Functions) > 0 && config.ShouldUseFunctions() {
|
||||||
log.Debug().Msgf("Response needs to process functions")
|
log.Debug().Msgf("Response needs to process functions")
|
||||||
@@ -126,14 +144,14 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
r := config.Roles[role]
|
r := config.Roles[role]
|
||||||
contentExists := i.Content != nil && *i.Content != ""
|
contentExists := i.Content != nil && i.StringContent != ""
|
||||||
// First attempt to populate content via a chat message specific template
|
// First attempt to populate content via a chat message specific template
|
||||||
if config.TemplateConfig.ChatMessage != "" {
|
if config.TemplateConfig.ChatMessage != "" {
|
||||||
chatMessageData := model.ChatMessageTemplateData{
|
chatMessageData := model.ChatMessageTemplateData{
|
||||||
SystemPrompt: config.SystemPrompt,
|
SystemPrompt: config.SystemPrompt,
|
||||||
Role: r,
|
Role: r,
|
||||||
RoleName: role,
|
RoleName: role,
|
||||||
Content: *i.Content,
|
Content: i.StringContent,
|
||||||
MessageIndex: messageIndex,
|
MessageIndex: messageIndex,
|
||||||
}
|
}
|
||||||
templatedChatMessage, err := o.Loader.EvaluateTemplateForChatMessage(config.TemplateConfig.ChatMessage, chatMessageData)
|
templatedChatMessage, err := o.Loader.EvaluateTemplateForChatMessage(config.TemplateConfig.ChatMessage, chatMessageData)
|
||||||
@@ -152,7 +170,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
|
|||||||
if content == "" {
|
if content == "" {
|
||||||
if r != "" {
|
if r != "" {
|
||||||
if contentExists {
|
if contentExists {
|
||||||
content = fmt.Sprint(r, " ", *i.Content)
|
content = fmt.Sprint(r, i.StringContent)
|
||||||
}
|
}
|
||||||
if i.FunctionCall != nil {
|
if i.FunctionCall != nil {
|
||||||
j, err := json.Marshal(i.FunctionCall)
|
j, err := json.Marshal(i.FunctionCall)
|
||||||
@@ -166,7 +184,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if contentExists {
|
if contentExists {
|
||||||
content = fmt.Sprint(*i.Content)
|
content = fmt.Sprint(i.StringContent)
|
||||||
}
|
}
|
||||||
if i.FunctionCall != nil {
|
if i.FunctionCall != nil {
|
||||||
j, err := json.Marshal(i.FunctionCall)
|
j, err := json.Marshal(i.FunctionCall)
|
||||||
@@ -231,17 +249,19 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if toStream {
|
if toStream {
|
||||||
responses := make(chan OpenAIResponse)
|
responses := make(chan schema.OpenAIResponse)
|
||||||
|
|
||||||
go process(predInput, input, config, o.Loader, responses)
|
go process(predInput, input, config, o.Loader, responses)
|
||||||
|
|
||||||
c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {
|
c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {
|
||||||
|
|
||||||
|
usage := &schema.OpenAIUsage{}
|
||||||
|
|
||||||
for ev := range responses {
|
for ev := range responses {
|
||||||
|
usage = &ev.Usage // Copy a pointer to the latest usage chunk so that the stop message can reference it
|
||||||
var buf bytes.Buffer
|
var buf bytes.Buffer
|
||||||
enc := json.NewEncoder(&buf)
|
enc := json.NewEncoder(&buf)
|
||||||
enc.Encode(ev)
|
enc.Encode(ev)
|
||||||
|
|
||||||
log.Debug().Msgf("Sending chunk: %s", buf.String())
|
log.Debug().Msgf("Sending chunk: %s", buf.String())
|
||||||
_, err := fmt.Fprintf(w, "data: %v\n", buf.String())
|
_, err := fmt.Fprintf(w, "data: %v\n", buf.String())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -252,15 +272,18 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
|
|||||||
w.Flush()
|
w.Flush()
|
||||||
}
|
}
|
||||||
|
|
||||||
resp := &OpenAIResponse{
|
resp := &schema.OpenAIResponse{
|
||||||
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
ID: id,
|
||||||
Choices: []Choice{
|
Created: created,
|
||||||
|
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||||
|
Choices: []schema.Choice{
|
||||||
{
|
{
|
||||||
FinishReason: "stop",
|
FinishReason: "stop",
|
||||||
Index: 0,
|
Index: 0,
|
||||||
Delta: &Message{Content: &emptyMessage},
|
Delta: &schema.Message{Content: &emptyMessage},
|
||||||
}},
|
}},
|
||||||
Object: "chat.completion.chunk",
|
Object: "chat.completion.chunk",
|
||||||
|
Usage: *usage,
|
||||||
}
|
}
|
||||||
respData, _ := json.Marshal(resp)
|
respData, _ := json.Marshal(resp)
|
||||||
|
|
||||||
@@ -271,7 +294,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
result, err := ComputeChoices(input, predInput, config, o, o.Loader, func(s string, c *[]Choice) {
|
result, tokenUsage, err := ComputeChoices(input, predInput, config, o, o.Loader, func(s string, c *[]schema.Choice) {
|
||||||
if processFunctions {
|
if processFunctions {
|
||||||
// As we have to change the result before processing, we can't stream the answer (yet?)
|
// As we have to change the result before processing, we can't stream the answer (yet?)
|
||||||
ss := map[string]interface{}{}
|
ss := map[string]interface{}{}
|
||||||
@@ -305,7 +328,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
|
|||||||
message = backend.Finetune(*config, predInput, message)
|
message = backend.Finetune(*config, predInput, message)
|
||||||
log.Debug().Msgf("Reply received from LLM(finetuned): %s", message)
|
log.Debug().Msgf("Reply received from LLM(finetuned): %s", message)
|
||||||
|
|
||||||
*c = append(*c, Choice{Message: &Message{Role: "assistant", Content: &message}})
|
*c = append(*c, schema.Choice{Message: &schema.Message{Role: "assistant", Content: &message}})
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -315,7 +338,11 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
|
|||||||
// Otherwise ask the LLM to understand the JSON output and the context, and return a message
|
// Otherwise ask the LLM to understand the JSON output and the context, and return a message
|
||||||
// Note: This costs (in term of CPU) another computation
|
// Note: This costs (in term of CPU) another computation
|
||||||
config.Grammar = ""
|
config.Grammar = ""
|
||||||
predFunc, err := backend.ModelInference(input.Context, predInput, o.Loader, *config, o, nil)
|
images := []string{}
|
||||||
|
for _, m := range input.Messages {
|
||||||
|
images = append(images, m.StringImages...)
|
||||||
|
}
|
||||||
|
predFunc, err := backend.ModelInference(input.Context, predInput, images, o.Loader, *config, o, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error().Msgf("inference error: %s", err.Error())
|
log.Error().Msgf("inference error: %s", err.Error())
|
||||||
return
|
return
|
||||||
@@ -327,28 +354,35 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
prediction = backend.Finetune(*config, predInput, prediction)
|
fineTunedResponse := backend.Finetune(*config, predInput, prediction.Response)
|
||||||
*c = append(*c, Choice{Message: &Message{Role: "assistant", Content: &prediction}})
|
*c = append(*c, schema.Choice{Message: &schema.Message{Role: "assistant", Content: &fineTunedResponse}})
|
||||||
} else {
|
} else {
|
||||||
// otherwise reply with the function call
|
// otherwise reply with the function call
|
||||||
*c = append(*c, Choice{
|
*c = append(*c, schema.Choice{
|
||||||
FinishReason: "function_call",
|
FinishReason: "function_call",
|
||||||
Message: &Message{Role: "assistant", FunctionCall: ss},
|
Message: &schema.Message{Role: "assistant", FunctionCall: ss},
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
*c = append(*c, Choice{FinishReason: "stop", Index: 0, Message: &Message{Role: "assistant", Content: &s}})
|
*c = append(*c, schema.Choice{FinishReason: "stop", Index: 0, Message: &schema.Message{Role: "assistant", Content: &s}})
|
||||||
}, nil)
|
}, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
resp := &OpenAIResponse{
|
resp := &schema.OpenAIResponse{
|
||||||
|
ID: id,
|
||||||
|
Created: created,
|
||||||
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||||
Choices: result,
|
Choices: result,
|
||||||
Object: "chat.completion",
|
Object: "chat.completion",
|
||||||
|
Usage: schema.OpenAIUsage{
|
||||||
|
PromptTokens: tokenUsage.Prompt,
|
||||||
|
CompletionTokens: tokenUsage.Completion,
|
||||||
|
TotalTokens: tokenUsage.Prompt + tokenUsage.Completion,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
respData, _ := json.Marshal(resp)
|
respData, _ := json.Marshal(resp)
|
||||||
log.Debug().Msgf("Response: %s", respData)
|
log.Debug().Msgf("Response: %s", respData)
|
||||||
|
|||||||
@@ -6,28 +6,43 @@ import (
|
|||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/go-skynet/LocalAI/api/backend"
|
||||||
config "github.com/go-skynet/LocalAI/api/config"
|
config "github.com/go-skynet/LocalAI/api/config"
|
||||||
"github.com/go-skynet/LocalAI/api/options"
|
"github.com/go-skynet/LocalAI/api/options"
|
||||||
|
"github.com/go-skynet/LocalAI/api/schema"
|
||||||
|
"github.com/go-skynet/LocalAI/pkg/grammar"
|
||||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||||
"github.com/gofiber/fiber/v2"
|
"github.com/gofiber/fiber/v2"
|
||||||
|
"github.com/google/uuid"
|
||||||
"github.com/rs/zerolog/log"
|
"github.com/rs/zerolog/log"
|
||||||
"github.com/valyala/fasthttp"
|
"github.com/valyala/fasthttp"
|
||||||
)
|
)
|
||||||
|
|
||||||
// https://platform.openai.com/docs/api-reference/completions
|
// https://platform.openai.com/docs/api-reference/completions
|
||||||
func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
|
func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
|
||||||
process := func(s string, req *OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan OpenAIResponse) {
|
id := uuid.New().String()
|
||||||
ComputeChoices(req, s, config, o, loader, func(s string, c *[]Choice) {}, func(s string) bool {
|
created := int(time.Now().Unix())
|
||||||
resp := OpenAIResponse{
|
|
||||||
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
|
process := func(s string, req *schema.OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
|
||||||
Choices: []Choice{
|
ComputeChoices(req, s, config, o, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
|
||||||
|
resp := schema.OpenAIResponse{
|
||||||
|
ID: id,
|
||||||
|
Created: created,
|
||||||
|
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||||
|
Choices: []schema.Choice{
|
||||||
{
|
{
|
||||||
Index: 0,
|
Index: 0,
|
||||||
Text: s,
|
Text: s,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
Object: "text_completion",
|
Object: "text_completion",
|
||||||
|
Usage: schema.OpenAIUsage{
|
||||||
|
PromptTokens: usage.Prompt,
|
||||||
|
CompletionTokens: usage.Completion,
|
||||||
|
TotalTokens: usage.Prompt + usage.Completion,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
log.Debug().Msgf("Sending goroutine: %s", s)
|
log.Debug().Msgf("Sending goroutine: %s", s)
|
||||||
|
|
||||||
@@ -50,6 +65,10 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
|
|||||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if input.ResponseFormat == "json_object" {
|
||||||
|
input.Grammar = grammar.JSONBNF
|
||||||
|
}
|
||||||
|
|
||||||
log.Debug().Msgf("Parameter Config: %+v", config)
|
log.Debug().Msgf("Parameter Config: %+v", config)
|
||||||
|
|
||||||
if input.Stream {
|
if input.Stream {
|
||||||
@@ -84,7 +103,7 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
|
|||||||
log.Debug().Msgf("Template found, input modified to: %s", predInput)
|
log.Debug().Msgf("Template found, input modified to: %s", predInput)
|
||||||
}
|
}
|
||||||
|
|
||||||
responses := make(chan OpenAIResponse)
|
responses := make(chan schema.OpenAIResponse)
|
||||||
|
|
||||||
go process(predInput, input, config, o.Loader, responses)
|
go process(predInput, input, config, o.Loader, responses)
|
||||||
|
|
||||||
@@ -100,9 +119,11 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
|
|||||||
w.Flush()
|
w.Flush()
|
||||||
}
|
}
|
||||||
|
|
||||||
resp := &OpenAIResponse{
|
resp := &schema.OpenAIResponse{
|
||||||
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
ID: id,
|
||||||
Choices: []Choice{
|
Created: created,
|
||||||
|
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||||
|
Choices: []schema.Choice{
|
||||||
{
|
{
|
||||||
Index: 0,
|
Index: 0,
|
||||||
FinishReason: "stop",
|
FinishReason: "stop",
|
||||||
@@ -119,7 +140,10 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
var result []Choice
|
var result []schema.Choice
|
||||||
|
|
||||||
|
totalTokenUsage := backend.TokenUsage{}
|
||||||
|
|
||||||
for k, i := range config.PromptStrings {
|
for k, i := range config.PromptStrings {
|
||||||
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
|
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
|
||||||
templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{
|
templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{
|
||||||
@@ -131,20 +155,31 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
|
|||||||
log.Debug().Msgf("Template found, input modified to: %s", i)
|
log.Debug().Msgf("Template found, input modified to: %s", i)
|
||||||
}
|
}
|
||||||
|
|
||||||
r, err := ComputeChoices(input, i, config, o, o.Loader, func(s string, c *[]Choice) {
|
r, tokenUsage, err := ComputeChoices(
|
||||||
*c = append(*c, Choice{Text: s, FinishReason: "stop", Index: k})
|
input, i, config, o, o.Loader, func(s string, c *[]schema.Choice) {
|
||||||
}, nil)
|
*c = append(*c, schema.Choice{Text: s, FinishReason: "stop", Index: k})
|
||||||
|
}, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
totalTokenUsage.Prompt += tokenUsage.Prompt
|
||||||
|
totalTokenUsage.Completion += tokenUsage.Completion
|
||||||
|
|
||||||
result = append(result, r...)
|
result = append(result, r...)
|
||||||
}
|
}
|
||||||
|
|
||||||
resp := &OpenAIResponse{
|
resp := &schema.OpenAIResponse{
|
||||||
|
ID: id,
|
||||||
|
Created: created,
|
||||||
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||||
Choices: result,
|
Choices: result,
|
||||||
Object: "text_completion",
|
Object: "text_completion",
|
||||||
|
Usage: schema.OpenAIUsage{
|
||||||
|
PromptTokens: totalTokenUsage.Prompt,
|
||||||
|
CompletionTokens: totalTokenUsage.Completion,
|
||||||
|
TotalTokens: totalTokenUsage.Prompt + totalTokenUsage.Completion,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
jsonResult, _ := json.Marshal(resp)
|
jsonResult, _ := json.Marshal(resp)
|
||||||
|
|||||||
@@ -3,11 +3,16 @@ package openai
  import (
  "encoding/json"
  "fmt"
+ "time"

+ "github.com/go-skynet/LocalAI/api/backend"
  config "github.com/go-skynet/LocalAI/api/config"
  "github.com/go-skynet/LocalAI/api/options"
+ "github.com/go-skynet/LocalAI/api/schema"
  model "github.com/go-skynet/LocalAI/pkg/model"
  "github.com/gofiber/fiber/v2"
+ "github.com/google/uuid"

  "github.com/rs/zerolog/log"
  )

@@ -31,7 +36,9 @@ func EditEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
  templateFile = config.TemplateConfig.Edit
  }

- var result []Choice
+ var result []schema.Choice
+ totalTokenUsage := backend.TokenUsage{}

  for _, i := range config.InputStrings {
  // A model can have a "file.bin.tmpl" file associated with a prompt template prefix
  templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.EditPromptTemplate, templateFile, model.PromptTemplateData{
@@ -44,20 +51,32 @@ func EditEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
  log.Debug().Msgf("Template found, input modified to: %s", i)
  }

- r, err := ComputeChoices(input, i, config, o, o.Loader, func(s string, c *[]Choice) {
- *c = append(*c, Choice{Text: s})
+ r, tokenUsage, err := ComputeChoices(input, i, config, o, o.Loader, func(s string, c *[]schema.Choice) {
+ *c = append(*c, schema.Choice{Text: s})
  }, nil)
  if err != nil {
  return err
  }

+ totalTokenUsage.Prompt += tokenUsage.Prompt
+ totalTokenUsage.Completion += tokenUsage.Completion

  result = append(result, r...)
  }

- resp := &OpenAIResponse{
+ id := uuid.New().String()
+ created := int(time.Now().Unix())
+ resp := &schema.OpenAIResponse{
+ ID: id,
+ Created: created,
  Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
  Choices: result,
  Object: "edit",
+ Usage: schema.OpenAIUsage{
+ PromptTokens: totalTokenUsage.Prompt,
+ CompletionTokens: totalTokenUsage.Completion,
+ TotalTokens: totalTokenUsage.Prompt + totalTokenUsage.Completion,
+ },
  }

  jsonResult, _ := json.Marshal(resp)
@@ -3,9 +3,13 @@ package openai
  import (
  "encoding/json"
  "fmt"
+ "time"

  "github.com/go-skynet/LocalAI/api/backend"
  config "github.com/go-skynet/LocalAI/api/config"
+ "github.com/go-skynet/LocalAI/api/schema"
+ "github.com/google/uuid"

  "github.com/go-skynet/LocalAI/api/options"
  "github.com/gofiber/fiber/v2"
  "github.com/rs/zerolog/log"
@@ -25,7 +29,7 @@ func EmbeddingsEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
  }

  log.Debug().Msgf("Parameter Config: %+v", config)
- items := []Item{}
+ items := []schema.Item{}

  for i, s := range config.InputToken {
  // get the model function to call for the result
@@ -38,7 +42,7 @@ func EmbeddingsEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
  if err != nil {
  return err
  }
- items = append(items, Item{Embedding: embeddings, Index: i, Object: "embedding"})
+ items = append(items, schema.Item{Embedding: embeddings, Index: i, Object: "embedding"})
  }

  for i, s := range config.InputStrings {
@@ -52,13 +56,17 @@ func EmbeddingsEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
  if err != nil {
  return err
  }
- items = append(items, Item{Embedding: embeddings, Index: i, Object: "embedding"})
+ items = append(items, schema.Item{Embedding: embeddings, Index: i, Object: "embedding"})
  }

- resp := &OpenAIResponse{
- Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
- Data: items,
- Object: "list",
+ id := uuid.New().String()
+ created := int(time.Now().Unix())
+ resp := &schema.OpenAIResponse{
+ ID: id,
+ Created: created,
+ Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
+ Data: items,
+ Object: "list",
  }

  jsonResult, _ := json.Marshal(resp)
@@ -1,6 +1,7 @@
  package openai

  import (
+ "bufio"
  "encoding/base64"
  "encoding/json"
  "fmt"
@@ -8,6 +9,10 @@ import (
  "path/filepath"
  "strconv"
  "strings"
+ "time"

+ "github.com/go-skynet/LocalAI/api/schema"
+ "github.com/google/uuid"

  "github.com/go-skynet/LocalAI/api/backend"
  config "github.com/go-skynet/LocalAI/api/config"
@@ -49,6 +54,31 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
  return fmt.Errorf("failed reading parameters from request:%w", err)
  }

+ src := ""
+ if input.File != "" {
+ //base 64 decode the file and write it somewhere
+ // that we will cleanup
+ decoded, err := base64.StdEncoding.DecodeString(input.File)
+ if err != nil {
+ return err
+ }
+ // Create a temporary file
+ outputFile, err := os.CreateTemp(o.ImageDir, "b64")
+ if err != nil {
+ return err
+ }
+ // write the base64 result
+ writer := bufio.NewWriter(outputFile)
+ _, err = writer.Write(decoded)
+ if err != nil {
+ outputFile.Close()
+ return err
+ }
+ outputFile.Close()
+ src = outputFile.Name()
+ defer os.RemoveAll(src)
+ }

  log.Debug().Msgf("Parameter Config: %+v", config)

  // XXX: Only stablediffusion is supported for now
@@ -73,8 +103,8 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
  if input.ResponseFormat == "b64_json" {
  b64JSON = true
  }
+ // src and clip_skip
- var result []Item
+ var result []schema.Item
  for _, i := range config.PromptStrings {
  n := input.N
  if input.N == 0 {
@@ -121,7 +151,7 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx

  baseURL := c.BaseURL()

- fn, err := backend.ImageGeneration(height, width, mode, step, input.Seed, positive_prompt, negative_prompt, output, o.Loader, *config, o)
+ fn, err := backend.ImageGeneration(height, width, mode, step, input.Seed, positive_prompt, negative_prompt, src, output, o.Loader, *config, o)
  if err != nil {
  return err
  }
@@ -129,7 +159,7 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
  return err
  }

- item := &Item{}
+ item := &schema.Item{}

  if b64JSON {
  defer os.RemoveAll(output)
@@ -147,8 +177,12 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
  }
  }

- resp := &OpenAIResponse{
- Data: result,
+ id := uuid.New().String()
+ created := int(time.Now().Unix())
+ resp := &schema.OpenAIResponse{
+ ID: id,
+ Created: created,
+ Data: result,
  }

  jsonResult, _ := json.Marshal(resp)
@@ -4,34 +4,52 @@ import (
  "github.com/go-skynet/LocalAI/api/backend"
  config "github.com/go-skynet/LocalAI/api/config"
  "github.com/go-skynet/LocalAI/api/options"
+ "github.com/go-skynet/LocalAI/api/schema"
  model "github.com/go-skynet/LocalAI/pkg/model"
  )

- func ComputeChoices(req *OpenAIRequest, predInput string, config *config.Config, o *options.Option, loader *model.ModelLoader, cb func(string, *[]Choice), tokenCallback func(string) bool) ([]Choice, error) {
- n := req.N
- result := []Choice{}
+ func ComputeChoices(
+ req *schema.OpenAIRequest,
+ predInput string,
+ config *config.Config,
+ o *options.Option,
+ loader *model.ModelLoader,
+ cb func(string, *[]schema.Choice),
+ tokenCallback func(string, backend.TokenUsage) bool) ([]schema.Choice, backend.TokenUsage, error) {
+ n := req.N // number of completions to return
+ result := []schema.Choice{}

  if n == 0 {
  n = 1
  }

- // get the model function to call for the result
- predFunc, err := backend.ModelInference(req.Context, predInput, loader, *config, o, tokenCallback)
- if err != nil {
- return result, err
- }
+ images := []string{}
+ for _, m := range req.Messages {
+ images = append(images, m.StringImages...)
+ }
+
+ // get the model function to call for the result
+ predFunc, err := backend.ModelInference(req.Context, predInput, images, loader, *config, o, tokenCallback)
+ if err != nil {
+ return result, backend.TokenUsage{}, err
+ }
+
+ tokenUsage := backend.TokenUsage{}

  for i := 0; i < n; i++ {
  prediction, err := predFunc()
  if err != nil {
- return result, err
+ return result, backend.TokenUsage{}, err
  }

- prediction = backend.Finetune(*config, predInput, prediction)
- cb(prediction, &result)
+ tokenUsage.Prompt += prediction.Usage.Prompt
+ tokenUsage.Completion += prediction.Usage.Completion
+
+ finetunedResponse := backend.Finetune(*config, predInput, prediction.Response)
+ cb(finetunedResponse, &result)

  //result = append(result, Choice{Text: prediction})

  }
- return result, err
+ return result, tokenUsage, err
  }
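For orientation, the sketch below shows how a caller can consume the reworked ComputeChoices signature above. It is not part of the change set: names such as input, prompt, cfg, opt, loader and totalTokenUsage stand in for values an endpoint handler would already hold (the edit endpoint hunk earlier follows the same pattern).

```go
// Minimal sketch, assuming the ComputeChoices signature shown in the hunk above.
choices, usage, err := ComputeChoices(
	input, prompt, cfg, opt, loader,
	func(s string, c *[]schema.Choice) {
		// collect one choice per generated prediction
		*c = append(*c, schema.Choice{Text: s})
	},
	nil, // no streaming token callback
)
if err != nil {
	return err
}
// aggregate per-call token usage into the response totals
totalTokenUsage.Prompt += usage.Prompt
totalTokenUsage.Completion += usage.Completion
_ = choices
```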
@@ -4,6 +4,7 @@ import (
  "regexp"

  config "github.com/go-skynet/LocalAI/api/config"
+ "github.com/go-skynet/LocalAI/api/schema"
  model "github.com/go-skynet/LocalAI/pkg/model"
  "github.com/gofiber/fiber/v2"
  )
@@ -16,7 +17,7 @@ func ListModelsEndpoint(loader *model.ModelLoader, cm *config.ConfigLoader) func
  }
  var mm map[string]interface{} = map[string]interface{}{}

- dataModels := []OpenAIModel{}
+ dataModels := []schema.OpenAIModel{}

  var filterFn func(name string) bool
  filter := c.Query("filter")
@@ -45,7 +46,7 @@ func ListModelsEndpoint(loader *model.ModelLoader, cm *config.ConfigLoader) func
  }

  if filterFn(c.Name) {
- dataModels = append(dataModels, OpenAIModel{ID: c.Name, Object: "model"})
+ dataModels = append(dataModels, schema.OpenAIModel{ID: c.Name, Object: "model"})
  }
  }

@@ -53,13 +54,13 @@ func ListModelsEndpoint(loader *model.ModelLoader, cm *config.ConfigLoader) func
  for _, m := range models {
  // And only adds them if they shouldn't be skipped.
  if _, exists := mm[m]; !exists && filterFn(m) {
- dataModels = append(dataModels, OpenAIModel{ID: m, Object: "model"})
+ dataModels = append(dataModels, schema.OpenAIModel{ID: m, Object: "model"})
  }
  }

  return c.JSON(struct {
  Object string `json:"object"`
- Data []OpenAIModel `json:"data"`
+ Data []schema.OpenAIModel `json:"data"`
  }{
  Object: "list",
  Data: dataModels,
@@ -2,28 +2,32 @@ package openai

  import (
  "context"
+ "encoding/base64"
  "encoding/json"
  "fmt"
+ "io/ioutil"
+ "net/http"
  "os"
  "path/filepath"
  "strings"

  config "github.com/go-skynet/LocalAI/api/config"
  options "github.com/go-skynet/LocalAI/api/options"
+ "github.com/go-skynet/LocalAI/api/schema"
  model "github.com/go-skynet/LocalAI/pkg/model"
  "github.com/gofiber/fiber/v2"
  "github.com/rs/zerolog/log"
  )

- func readInput(c *fiber.Ctx, o *options.Option, randomModel bool) (string, *OpenAIRequest, error) {
+ func readInput(c *fiber.Ctx, o *options.Option, randomModel bool) (string, *schema.OpenAIRequest, error) {
  loader := o.Loader
- input := new(OpenAIRequest)
+ input := new(schema.OpenAIRequest)
  ctx, cancel := context.WithCancel(o.Context)
  input.Context = ctx
  input.Cancel = cancel
  // Get input data from the request body
  if err := c.BodyParser(input); err != nil {
- return "", nil, err
+ return "", nil, fmt.Errorf("failed parsing request body: %w", err)
  }

  modelFile := input.Model
@@ -60,7 +64,38 @@ func readInput(c *fiber.Ctx, o *options.Option, randomModel bool) (string, *Open
  return modelFile, input, nil
  }

- func updateConfig(config *config.Config, input *OpenAIRequest) {
+ // this function check if the string is an URL, if it's an URL downloads the image in memory
+ // encodes it in base64 and returns the base64 string
+ func getBase64Image(s string) (string, error) {
+ if strings.HasPrefix(s, "http") {
+ // download the image
+ resp, err := http.Get(s)
+ if err != nil {
+ return "", err
+ }
+ defer resp.Body.Close()
+
+ // read the image data into memory
+ data, err := ioutil.ReadAll(resp.Body)
+ if err != nil {
+ return "", err
+ }
+
+ // encode the image data in base64
+ encoded := base64.StdEncoding.EncodeToString(data)
+
+ // return the base64 string
+ return encoded, nil
+ }
+
+ // if the string instead is prefixed with "data:image/jpeg;base64,", drop it
+ if strings.HasPrefix(s, "data:image/jpeg;base64,") {
+ return strings.ReplaceAll(s, "data:image/jpeg;base64,", ""), nil
+ }
+ return "", fmt.Errorf("not valid string")
+ }
+
+ func updateConfig(config *config.Config, input *schema.OpenAIRequest) {
  if input.Echo {
  config.Echo = input.Echo
  }
@@ -75,6 +110,10 @@ func updateConfig(config *config.Config, input *OpenAIRequest) {
  config.Backend = input.Backend
  }

+ if input.ClipSkip != 0 {
+ config.Diffusers.ClipSkip = input.ClipSkip
+ }
+
  if input.ModelBaseName != "" {
  config.AutoGPTQ.ModelBaseName = input.ModelBaseName
  }
@@ -124,6 +163,35 @@ func updateConfig(config *config.Config, input *OpenAIRequest) {
  }
  }

+ // Decode each request's message content
+ index := 0
+ for i, m := range input.Messages {
+ switch content := m.Content.(type) {
+ case string:
+ input.Messages[i].StringContent = content
+ case []interface{}:
+ dat, _ := json.Marshal(content)
+ c := []schema.Content{}
+ json.Unmarshal(dat, &c)
+ for _, pp := range c {
+ if pp.Type == "text" {
+ input.Messages[i].StringContent = pp.Text
+ } else if pp.Type == "image_url" {
+ // Detect if pp.ImageURL is an URL, if it is download the image and encode it in base64:
+ base64, err := getBase64Image(pp.ImageURL.URL)
+ if err == nil {
+ input.Messages[i].StringImages = append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff
+ // set a placeholder for each image
+ input.Messages[i].StringContent = fmt.Sprintf("[img-%d]", index) + input.Messages[i].StringContent
+ index++
+ } else {
+ fmt.Print("Failed encoding image", err)
+ }
+ }
+ }
+ }
+ }
+ }
+
  if input.RepeatPenalty != 0 {
  config.RepeatPenalty = input.RepeatPenalty
  }
@@ -214,7 +282,7 @@ func updateConfig(config *config.Config, input *OpenAIRequest) {
  }
  }

- func readConfig(modelFile string, input *OpenAIRequest, cm *config.ConfigLoader, loader *model.ModelLoader, debug bool, threads, ctx int, f16 bool) (*config.Config, *OpenAIRequest, error) {
+ func readConfig(modelFile string, input *schema.OpenAIRequest, cm *config.ConfigLoader, loader *model.ModelLoader, debug bool, threads, ctx int, f16 bool) (*config.Config, *schema.OpenAIRequest, error) {
  // Load a config file if present after the model name
  modelConfig := filepath.Join(loader.ModelPath, modelFile+".yaml")
@@ -7,6 +7,7 @@ import (

  "github.com/go-skynet/LocalAI/pkg/gallery"
  model "github.com/go-skynet/LocalAI/pkg/model"
+ "github.com/go-skynet/LocalAI/metrics"
  "github.com/rs/zerolog/log"
  )

@@ -24,6 +25,7 @@ type Option struct {
  PreloadModelsFromPath string
  CORSAllowOrigins string
  ApiKeys []string
+ Metrics *metrics.Metrics

  Galleries []gallery.Gallery

@@ -33,6 +35,8 @@ type Option struct {
  ExternalGRPCBackends map[string]string

  AutoloadGalleries bool

+ SingleBackend bool
  }

  type AppOption func(*Option)
@@ -58,6 +62,10 @@ func WithCors(b bool) AppOption {
  }
  }

+ var EnableSingleBackend = func(o *Option) {
+ o.SingleBackend = true
+ }
+
  var EnableGalleriesAutoload = func(o *Option) {
  o.AutoloadGalleries = true
  }
@@ -93,6 +101,7 @@ func WithStringGalleries(galls string) AppOption {
  return func(o *Option) {
  if galls == "" {
  log.Debug().Msgf("no galleries to load")
+ o.Galleries = []gallery.Gallery{}
  return
  }
  var galleries []gallery.Gallery
@@ -191,3 +200,9 @@ func WithApiKeys(apiKeys []string) AppOption {
  o.ApiKeys = apiKeys
  }
  }
+
+ func WithMetrics(meter *metrics.Metrics) AppOption {
+ return func(o *Option) {
+ o.Metrics = meter
+ }
+ }
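As a rough illustration of how the new knobs compose with the existing AppOption pattern, here is a small sketch. Only EnableSingleBackend, WithMetrics and the Metrics/SingleBackend fields come from this diff; everything else (including the metrics value m and applying the options by hand) is assumed for the example.

```go
// Sketch only: apply options manually the way LocalAI's option wiring would.
var m *metrics.Metrics // assume this was created elsewhere
o := &options.Option{}
for _, apply := range []options.AppOption{
	options.EnableSingleBackend, // added in this diff: keep a single backend loaded
	options.WithMetrics(m),      // added in this diff: attach the metrics instance
} {
	apply(o)
}
```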
@@ -1,4 +1,4 @@
- package openai
+ package schema

  import (
  "context"
@@ -55,11 +55,25 @@ type Choice struct {
  Text string `json:"text,omitempty"`
  }

+ type Content struct {
+ Type string `json:"type" yaml:"type"`
+ Text string `json:"text" yaml:"text"`
+ ImageURL ContentURL `json:"image_url" yaml:"image_url"`
+ }
+
+ type ContentURL struct {
+ URL string `json:"url" yaml:"url"`
+ }
+
  type Message struct {
  // The message role
  Role string `json:"role,omitempty" yaml:"role"`
  // The message content
- Content *string `json:"content" yaml:"content"`
+ Content interface{} `json:"content" yaml:"content"`
+
+ StringContent string `json:"string_content,omitempty" yaml:"string_content,omitempty"`
+ StringImages []string `json:"string_images,omitempty" yaml:"string_images,omitempty"`

  // A result of a function call
  FunctionCall interface{} `json:"function_call,omitempty" yaml:"function_call,omitempty"`
  }
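To make the new Content shape concrete, here is a small self-contained sketch (not part of the change set; the image URL is made up) showing how an array-of-parts message body decodes against these field names, using the same two-step re-marshal trick as the request-handling hunk earlier:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Local stand-ins that mirror the Content/ContentURL fields added above.
type ContentURL struct {
	URL string `json:"url"`
}
type Content struct {
	Type     string     `json:"type"`
	Text     string     `json:"text"`
	ImageURL ContentURL `json:"image_url"`
}

func main() {
	// A "vision"-style content value: an array of typed parts instead of a string.
	payload := `[{"type":"text","text":"What is in this image?"},
		{"type":"image_url","image_url":{"url":"https://example.com/cat.png"}}]`

	var raw interface{} // this is what a Content field of type interface{} receives
	if err := json.Unmarshal([]byte(payload), &raw); err != nil {
		panic(err)
	}
	dat, _ := json.Marshal(raw) // re-marshal the []interface{} ...
	var parts []Content
	_ = json.Unmarshal(dat, &parts) // ... and decode it into the typed slice
	fmt.Printf("%d parts, image url: %s\n", len(parts), parts[1].ImageURL.URL)
}
```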
@@ -1,4 +1,4 @@
- package api
+ package schema

  import "time"
backend/cpp/grpc/.gitignore (vendored, new file, 3 lines)
@@ -0,0 +1,3 @@
installed_packages/
grpc_build/
grpc_repo/
backend/cpp/grpc/script/build_grpc.sh (new executable file, 81 lines)
@@ -0,0 +1,81 @@
#!/bin/bash

# Builds locally from sources the packages needed by the llama cpp backend.

# Makes sure a few base packages exist.
# sudo apt-get --no-upgrade -y install g++ gcc binutils cmake git build-essential autoconf libtool pkg-config

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
echo "Script directory: $SCRIPT_DIR"

CPP_INSTALLED_PACKAGES_DIR=$1
if [ -z ${CPP_INSTALLED_PACKAGES_DIR} ]; then
    echo "CPP_INSTALLED_PACKAGES_DIR env variable not set. Don't know where to install: failed.";
    echo
    exit -1
fi

if [ -d "${CPP_INSTALLED_PACKAGES_DIR}" ]; then
    echo "gRPC installation directory already exists. Nothing to do."
    exit 0
fi

# The depth when cloning a git repo. 1 speeds up the clone when the repo history is not needed.
GIT_CLONE_DEPTH=1

NUM_BUILD_THREADS=$(nproc --ignore=1)

# Google gRPC --------------------------------------------------------------------------------------
TAG_LIB_GRPC="v1.59.0"
GIT_REPO_LIB_GRPC="https://github.com/grpc/grpc.git"
GRPC_REPO_DIR="${SCRIPT_DIR}/../grpc_repo"
GRPC_BUILD_DIR="${SCRIPT_DIR}/../grpc_build"
SRC_DIR_LIB_GRPC="${GRPC_REPO_DIR}/grpc"

echo "SRC_DIR_LIB_GRPC: ${SRC_DIR_LIB_GRPC}"
echo "GRPC_REPO_DIR: ${GRPC_REPO_DIR}"
echo "GRPC_BUILD_DIR: ${GRPC_BUILD_DIR}"

mkdir -pv ${GRPC_REPO_DIR}

rm -rf ${GRPC_BUILD_DIR}
mkdir -pv ${GRPC_BUILD_DIR}

mkdir -pv ${CPP_INSTALLED_PACKAGES_DIR}

if [ -d "${SRC_DIR_LIB_GRPC}" ]; then
    echo "gRPC source already exists locally. Not cloned again."
else
    ( cd ${GRPC_REPO_DIR} && \
      git clone --depth ${GIT_CLONE_DEPTH} -b ${TAG_LIB_GRPC} ${GIT_REPO_LIB_GRPC} && \
      cd ${SRC_DIR_LIB_GRPC} && \
      git submodule update --init --recursive --depth ${GIT_CLONE_DEPTH}
    )
fi

( cd ${GRPC_BUILD_DIR} && \
  cmake -G "Unix Makefiles" \
    -DCMAKE_BUILD_TYPE=Release \
    -DgRPC_INSTALL=ON \
    -DEXECUTABLE_OUTPUT_PATH=${CPP_INSTALLED_PACKAGES_DIR}/grpc/bin \
    -DLIBRARY_OUTPUT_PATH=${CPP_INSTALLED_PACKAGES_DIR}/grpc/lib \
    -DgRPC_BUILD_TESTS=OFF \
    -DgRPC_BUILD_CSHARP_EXT=OFF \
    -DgRPC_BUILD_GRPC_CPP_PLUGIN=ON \
    -DgRPC_BUILD_GRPC_CSHARP_PLUGIN=OFF \
    -DgRPC_BUILD_GRPC_NODE_PLUGIN=OFF \
    -DgRPC_BUILD_GRPC_OBJECTIVE_C_PLUGIN=OFF \
    -DgRPC_BUILD_GRPC_PHP_PLUGIN=OFF \
    -DgRPC_BUILD_GRPC_PYTHON_PLUGIN=ON \
    -DgRPC_BUILD_GRPC_RUBY_PLUGIN=OFF \
    -Dprotobuf_WITH_ZLIB=ON \
    -DRE2_BUILD_TESTING=OFF \
    -DCMAKE_INSTALL_PREFIX=${CPP_INSTALLED_PACKAGES_DIR}/ \
    ${SRC_DIR_LIB_GRPC} && \
  cmake --build . -- -j ${NUM_BUILD_THREADS} && \
  cmake --build . --target install -- -j ${NUM_BUILD_THREADS}
)

rm -rf ${GRPC_BUILD_DIR}
rm -rf ${GRPC_REPO_DIR}
backend/cpp/llama/CMakeLists.txt (new file, 74 lines)
@@ -0,0 +1,74 @@
## XXX: In some versions of CMake clip wasn't being built before llama.
## This is an hack for now, but it should be fixed in the future.
set(TARGET myclip)
add_library(${TARGET} clip.cpp clip.h)
install(TARGETS ${TARGET} LIBRARY)
target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if (NOT MSVC)
    target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
endif()

set(TARGET grpc-server)
# END CLIP hack
set(CMAKE_CXX_STANDARD 17)
cmake_minimum_required(VERSION 3.15)
set(TARGET grpc-server)
set(_PROTOBUF_LIBPROTOBUF libprotobuf)
set(_REFLECTION grpc++_reflection)
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
    link_directories("/opt/homebrew/lib")
    include_directories("/opt/homebrew/include")
endif()

find_package(absl CONFIG REQUIRED)
find_package(Protobuf CONFIG REQUIRED)
find_package(gRPC CONFIG REQUIRED)

find_program(_PROTOBUF_PROTOC protoc)
set(_GRPC_GRPCPP grpc++)
find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin)

include_directories(${CMAKE_CURRENT_BINARY_DIR})
include_directories(${Protobuf_INCLUDE_DIRS})

message(STATUS "Using protobuf version ${Protobuf_VERSION} | Protobuf_INCLUDE_DIRS: ${Protobuf_INCLUDE_DIRS} | CMAKE_CURRENT_BINARY_DIR: ${CMAKE_CURRENT_BINARY_DIR}")

# Proto file
get_filename_component(hw_proto "../../../../../../pkg/grpc/proto/backend.proto" ABSOLUTE)
get_filename_component(hw_proto_path "${hw_proto}" PATH)

# Generated sources
set(hw_proto_srcs "${CMAKE_CURRENT_BINARY_DIR}/backend.pb.cc")
set(hw_proto_hdrs "${CMAKE_CURRENT_BINARY_DIR}/backend.pb.h")
set(hw_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/backend.grpc.pb.cc")
set(hw_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/backend.grpc.pb.h")

add_custom_command(
    OUTPUT "${hw_proto_srcs}" "${hw_proto_hdrs}" "${hw_grpc_srcs}" "${hw_grpc_hdrs}"
    COMMAND ${_PROTOBUF_PROTOC}
    ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}"
        --cpp_out "${CMAKE_CURRENT_BINARY_DIR}"
        -I "${hw_proto_path}"
        --plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}"
        "${hw_proto}"
    DEPENDS "${hw_proto}")

# hw_grpc_proto
add_library(hw_grpc_proto
    ${hw_grpc_srcs}
    ${hw_grpc_hdrs}
    ${hw_proto_srcs}
    ${hw_proto_hdrs} )

add_executable(${TARGET} grpc-server.cpp json.hpp )
target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
    absl::flags_parse
    gRPC::${_REFLECTION}
    gRPC::${_GRPC_GRPCPP}
    protobuf::${_PROTOBUF_LIBPROTOBUF})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
    add_dependencies(${TARGET} BUILD_INFO)
endif()
backend/cpp/llama/Makefile (new file, 50 lines)
@@ -0,0 +1,50 @@
LLAMA_VERSION?=d9b33fe95bd257b36c84ee5769cc048230067d6f

CMAKE_ARGS?=
BUILD_TYPE?=

# If build type is cublas, then we set -DLLAMA_CUBLAS=ON to CMAKE_ARGS automatically
ifeq ($(BUILD_TYPE),cublas)
	CMAKE_ARGS+=-DLLAMA_CUBLAS=ON
# If build type is openblas then we set -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
# to CMAKE_ARGS automatically
else ifeq ($(BUILD_TYPE),openblas)
	CMAKE_ARGS+=-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
# If build type is clblast (openCL) we set -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
else ifeq ($(BUILD_TYPE),clblast)
	CMAKE_ARGS+=-DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
else ifeq ($(BUILD_TYPE),hipblas)
	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON
endif

llama.cpp:
	git clone --recurse-submodules https://github.com/ggerganov/llama.cpp llama.cpp
	cd llama.cpp && git checkout -b build $(LLAMA_VERSION) && git submodule update --init --recursive --depth 1

llama.cpp/examples/grpc-server:
	mkdir -p llama.cpp/examples/grpc-server
	cp -r $(abspath ./)/CMakeLists.txt llama.cpp/examples/grpc-server/
	cp -r $(abspath ./)/grpc-server.cpp llama.cpp/examples/grpc-server/
	cp -rfv $(abspath ./)/json.hpp llama.cpp/examples/grpc-server/
	echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
## XXX: In some versions of CMake clip wasn't being built before llama.
## This is an hack for now, but it should be fixed in the future.
	cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
	cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp

rebuild:
	cp -rfv $(abspath ./)/CMakeLists.txt llama.cpp/examples/grpc-server/
	cp -rfv $(abspath ./)/grpc-server.cpp llama.cpp/examples/grpc-server/
	cp -rfv $(abspath ./)/json.hpp llama.cpp/examples/grpc-server/
	rm -rf grpc-server
	$(MAKE) grpc-server

clean:
	rm -rf llama.cpp
	rm -rf grpc-server

grpc-server: llama.cpp llama.cpp/examples/grpc-server
	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release
	cp llama.cpp/build/bin/grpc-server .
backend/cpp/llama/grpc-server.cpp (new file, 2192 lines): diff suppressed because it is too large.
backend/cpp/llama/json.hpp (new file, 24596 lines): diff suppressed because it is too large.
@@ -5,8 +5,8 @@ package main
  import (
  "flag"

+ bert "github.com/go-skynet/LocalAI/pkg/backend/llm/bert"
  grpc "github.com/go-skynet/LocalAI/pkg/grpc"
- bert "github.com/go-skynet/LocalAI/pkg/grpc/llm/bert"
  )

  var (
@@ -5,7 +5,7 @@ package main
  import (
  "flag"

- transformers "github.com/go-skynet/LocalAI/pkg/grpc/llm/transformers"
+ transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"

  grpc "github.com/go-skynet/LocalAI/pkg/grpc"
  )
@@ -5,7 +5,7 @@ package main
  import (
  "flag"

- transformers "github.com/go-skynet/LocalAI/pkg/grpc/llm/transformers"
+ transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"

  grpc "github.com/go-skynet/LocalAI/pkg/grpc"
  )
@@ -1,25 +0,0 @@
- package main
-
- // GRPC Falcon server
-
- // Note: this is started internally by LocalAI and a server is allocated for each model
-
- import (
- "flag"
-
- falcon "github.com/go-skynet/LocalAI/pkg/grpc/llm/falcon"
-
- grpc "github.com/go-skynet/LocalAI/pkg/grpc"
- )
-
- var (
- addr = flag.String("addr", "localhost:50051", "the address to connect to")
- )
-
- func main() {
- flag.Parse()
-
- if err := grpc.StartServer(*addr, &falcon.LLM{}); err != nil {
- panic(err)
- }
- }
@@ -5,7 +5,7 @@ package main
  import (
  "flag"

- transformers "github.com/go-skynet/LocalAI/pkg/grpc/llm/transformers"
+ transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"

  grpc "github.com/go-skynet/LocalAI/pkg/grpc"
  )
@@ -5,7 +5,7 @@ package main
  import (
  "flag"

- gpt4all "github.com/go-skynet/LocalAI/pkg/grpc/llm/gpt4all"
+ gpt4all "github.com/go-skynet/LocalAI/pkg/backend/llm/gpt4all"

  grpc "github.com/go-skynet/LocalAI/pkg/grpc"
  )
@@ -5,7 +5,7 @@ package main
  import (
  "flag"

- transformers "github.com/go-skynet/LocalAI/pkg/grpc/llm/transformers"
+ transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"

  grpc "github.com/go-skynet/LocalAI/pkg/grpc"
  )
@@ -5,7 +5,7 @@ package main
  import (
  "flag"

- transformers "github.com/go-skynet/LocalAI/pkg/grpc/llm/transformers"
+ transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"

  grpc "github.com/go-skynet/LocalAI/pkg/grpc"
  )
@@ -5,7 +5,7 @@ package main
  import (
  "flag"

- langchain "github.com/go-skynet/LocalAI/pkg/grpc/llm/langchain"
+ langchain "github.com/go-skynet/LocalAI/pkg/backend/llm/langchain"

  grpc "github.com/go-skynet/LocalAI/pkg/grpc"
  )
@@ -1,11 +1,9 @@
  package main

- // Note: this is started internally by LocalAI and a server is allocated for each model
-
  import (
  "flag"

- bloomz "github.com/go-skynet/LocalAI/pkg/grpc/llm/bloomz"
+ llama "github.com/go-skynet/LocalAI/pkg/backend/llm/llama-stable"

  grpc "github.com/go-skynet/LocalAI/pkg/grpc"
  )
@@ -17,7 +15,7 @@ var (
  func main() {
  flag.Parse()

- if err := grpc.StartServer(*addr, &bloomz.LLM{}); err != nil {
+ if err := grpc.StartServer(*addr, &llama.LLM{}); err != nil {
  panic(err)
  }
  }
@@ -7,7 +7,7 @@ package main
  import (
  "flag"

- llama "github.com/go-skynet/LocalAI/pkg/grpc/llm/llama"
+ llama "github.com/go-skynet/LocalAI/pkg/backend/llm/llama"

  grpc "github.com/go-skynet/LocalAI/pkg/grpc"
  )
@@ -5,7 +5,7 @@ package main
  import (
  "flag"

- transformers "github.com/go-skynet/LocalAI/pkg/grpc/llm/transformers"
+ transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"

  grpc "github.com/go-skynet/LocalAI/pkg/grpc"
  )
@@ -5,7 +5,7 @@ package main
  import (
  "flag"

- tts "github.com/go-skynet/LocalAI/pkg/grpc/tts"
+ tts "github.com/go-skynet/LocalAI/pkg/backend/tts"

  grpc "github.com/go-skynet/LocalAI/pkg/grpc"
  )
@@ -5,7 +5,7 @@ package main
  import (
  "flag"

- transformers "github.com/go-skynet/LocalAI/pkg/grpc/llm/transformers"
+ transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"

  grpc "github.com/go-skynet/LocalAI/pkg/grpc"
  )
@@ -5,7 +5,7 @@ package main
  import (
  "flag"

- rwkv "github.com/go-skynet/LocalAI/pkg/grpc/llm/rwkv"
+ rwkv "github.com/go-skynet/LocalAI/pkg/backend/llm/rwkv"

  grpc "github.com/go-skynet/LocalAI/pkg/grpc"
  )
@@ -5,7 +5,7 @@ package main
  import (
  "flag"

- image "github.com/go-skynet/LocalAI/pkg/grpc/image"
+ image "github.com/go-skynet/LocalAI/pkg/backend/image"

  grpc "github.com/go-skynet/LocalAI/pkg/grpc"
  )
@@ -5,7 +5,7 @@ package main
  import (
  "flag"

- transformers "github.com/go-skynet/LocalAI/pkg/grpc/llm/transformers"
+ transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"

  grpc "github.com/go-skynet/LocalAI/pkg/grpc"
  )
@@ -5,7 +5,7 @@ package main
  import (
  "flag"

- transcribe "github.com/go-skynet/LocalAI/pkg/grpc/transcribe"
+ transcribe "github.com/go-skynet/LocalAI/pkg/backend/transcribe"

  grpc "github.com/go-skynet/LocalAI/pkg/grpc"
  )
custom-ca-certs/.keep (new empty file)
@@ -12,4 +12,5 @@ services:
  - .env
  volumes:
  - ./models:/models:cached
+ - ./images/:/tmp/generated/images/
  command: ["/usr/bin/local-ai" ]
@@ -5,7 +5,7 @@ cd /build

  if [ "$REBUILD" != "false" ]; then
  rm -rf ./local-ai
- ESPEAK_DATA=/build/lib/Linux-$(uname -m)/piper_phonemize/lib/espeak-ng-data make build -j${BUILD_PARALLELISM:-1}
+ make build -j${BUILD_PARALLELISM:-1}
  else
  echo "@@@@@"
  echo "Skipping rebuild"
@@ -157,6 +157,26 @@ Allows to run any LocalAI-compatible model as a backend on the servers of https:

  [Check it out here](https://runpod.io/gsc?template=uv9mtqnrd0&ref=984wlcra)

+ ### Continue
+
+ _by [@gruberdev](https://github.com/gruberdev)_
+
+ <img src="continue/img/screen.png" width="600" height="200" alt="Screenshot">
+
+ Demonstrates how to integrate an open-source copilot alternative that enhances code analysis, completion, and improvements. This approach seamlessly integrates with any LocalAI model, offering a more user-friendly experience.
+
+ [Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/continue/)
+
+ ### Streamlit bot
+
+ _by [@majoshi1](https://github.com/majoshi1)_
+
+ 
+
+ A chat bot made using `Streamlit` & LocalAI.
+
+ [Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/streamlit-bot/)
+
  ## Want to contribute?

  Create an issue, and put `Example: <description>` in the title! We will post your examples here.
@@ -1,5 +1,9 @@
+ # CPU .env docs: https://localai.io/howtos/easy-setup-docker-cpu/
+ # GPU .env docs: https://localai.io/howtos/easy-setup-docker-gpu/
+
  OPENAI_API_KEY=sk---anystringhere
  OPENAI_API_BASE=http://api:8080/v1
  # Models to preload at start
- # Here we configure gpt4all as gpt-3.5-turbo and bert as embeddings
+ # Here we configure gpt4all as gpt-3.5-turbo and bert as embeddings,
+ # see other options in the model gallery at https://github.com/go-skynet/model-gallery
  PRELOAD_MODELS=[{"url": "github:go-skynet/model-gallery/gpt4all-j.yaml", "name": "gpt-3.5-turbo"}, { "url": "github:go-skynet/model-gallery/bert-embeddings.yaml", "name": "text-embedding-ada-002"}]
@@ -10,12 +10,16 @@ git clone https://github.com/go-skynet/LocalAI

  cd LocalAI/examples/autoGPT

+ cp -rfv .env.example .env
+
+ # Edit the .env file to set a different model by editing `PRELOAD_MODELS`.
+ vim .env
+
  docker-compose run --rm auto-gpt
  ```

  Note: The example automatically downloads the `gpt4all` model as it is under a permissive license. The GPT4All model does not seem to be enough to run AutoGPT. WizardLM-7b-uncensored seems to perform better (with `f16: true`).

- See the `.env` configuration file to set a different model with the [model-gallery](https://github.com/go-skynet/model-gallery) by editing `PRELOAD_MODELS`.
-
  ## Without docker

|||||||
@@ -0,0 +1,11 @@
|
|||||||
|
meta {
|
||||||
|
name: backend monitor
|
||||||
|
type: http
|
||||||
|
seq: 4
|
||||||
|
}
|
||||||
|
|
||||||
|
get {
|
||||||
|
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/backend/monitor
|
||||||
|
body: none
|
||||||
|
auth: none
|
||||||
|
}
|
||||||
@@ -0,0 +1,21 @@
|
|||||||
|
meta {
|
||||||
|
name: backend-shutdown
|
||||||
|
type: http
|
||||||
|
seq: 3
|
||||||
|
}
|
||||||
|
|
||||||
|
post {
|
||||||
|
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/backend/shutdown
|
||||||
|
body: json
|
||||||
|
auth: none
|
||||||
|
}
|
||||||
|
|
||||||
|
headers {
|
||||||
|
Content-Type: application/json
|
||||||
|
}
|
||||||
|
|
||||||
|
body:json {
|
||||||
|
{
|
||||||
|
"model": "{{DEFAULT_MODEL}}"
|
||||||
|
}
|
||||||
|
}
|
||||||
5
examples/bruno/LocalAI Test Requests/bruno.json
Normal file
5
examples/bruno/LocalAI Test Requests/bruno.json
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
{
|
||||||
|
"version": "1",
|
||||||
|
"name": "LocalAI Test Requests",
|
||||||
|
"type": "collection"
|
||||||
|
}
|
||||||
@@ -0,0 +1,6 @@
|
|||||||
|
vars {
|
||||||
|
HOST: localhost
|
||||||
|
PORT: 8080
|
||||||
|
DEFAULT_MODEL: gpt-3.5-turbo
|
||||||
|
PROTOCOL: http://
|
||||||
|
}
|
||||||
11
examples/bruno/LocalAI Test Requests/get models list.bru
Normal file
11
examples/bruno/LocalAI Test Requests/get models list.bru
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
meta {
|
||||||
|
name: get models list
|
||||||
|
type: http
|
||||||
|
seq: 2
|
||||||
|
}
|
||||||
|
|
||||||
|
get {
|
||||||
|
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models
|
||||||
|
body: none
|
||||||
|
auth: none
|
||||||
|
}
|
||||||
@@ -0,0 +1,25 @@
|
|||||||
|
meta {
|
||||||
|
name: Generate image
|
||||||
|
type: http
|
||||||
|
seq: 1
|
||||||
|
}
|
||||||
|
|
||||||
|
post {
|
||||||
|
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/images/generations
|
||||||
|
body: json
|
||||||
|
auth: none
|
||||||
|
}
|
||||||
|
|
||||||
|
headers {
|
||||||
|
Content-Type: application/json
|
||||||
|
}
|
||||||
|
|
||||||
|
body:json {
|
||||||
|
{
|
||||||
|
"prompt": "<positive prompt>|<negative prompt>",
|
||||||
|
"model": "model-name",
|
||||||
|
"step": 51,
|
||||||
|
"size": "1024x1024",
|
||||||
|
"image": ""
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,24 @@
|
|||||||
|
meta {
|
||||||
|
name: -completions
|
||||||
|
type: http
|
||||||
|
seq: 4
|
||||||
|
}
|
||||||
|
|
||||||
|
post {
|
||||||
|
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/completions
|
||||||
|
body: json
|
||||||
|
auth: none
|
||||||
|
}
|
||||||
|
|
||||||
|
headers {
|
||||||
|
Content-Type: application/json
|
||||||
|
}
|
||||||
|
|
||||||
|
body:json {
|
||||||
|
{
|
||||||
|
"model": "{{DEFAULT_MODEL}}",
|
||||||
|
"prompt": "function downloadFile(string url, string outputPath) {",
|
||||||
|
"max_tokens": 256,
|
||||||
|
"temperature": 0.5
|
||||||
|
}
|
||||||
|
}
|
||||||
23
examples/bruno/LocalAI Test Requests/llm text/-edits.bru
Normal file
23
examples/bruno/LocalAI Test Requests/llm text/-edits.bru
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
meta {
|
||||||
|
name: -edits
|
||||||
|
type: http
|
||||||
|
seq: 5
|
||||||
|
}
|
||||||
|
|
||||||
|
post {
|
||||||
|
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/edits
|
||||||
|
body: json
|
||||||
|
auth: none
|
||||||
|
}
|
||||||
|
|
||||||
|
headers {
|
||||||
|
Content-Type: application/json
|
||||||
|
}
|
||||||
|
|
||||||
|
body:json {
|
||||||
|
{
|
||||||
|
"model": "{{DEFAULT_MODEL}}",
|
||||||
|
"input": "What day of the wek is it?",
|
||||||
|
"instruction": "Fix the spelling mistakes"
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,22 @@
|
|||||||
|
meta {
|
||||||
|
name: -embeddings
|
||||||
|
type: http
|
||||||
|
seq: 6
|
||||||
|
}
|
||||||
|
|
||||||
|
post {
|
||||||
|
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/embeddings
|
||||||
|
body: json
|
||||||
|
auth: none
|
||||||
|
}
|
||||||
|
|
||||||
|
headers {
|
||||||
|
Content-Type: application/json
|
||||||
|
}
|
||||||
|
|
||||||
|
body:json {
|
||||||
|
{
|
||||||
|
"model": "{{DEFAULT_MODEL}}",
|
||||||
|
"input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,30 @@
|
|||||||
|
meta {
|
||||||
|
name: chat completion -simple- 1 message-
|
||||||
|
type: http
|
||||||
|
seq: 4
|
||||||
|
}
|
||||||
|
|
||||||
|
post {
|
||||||
|
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
|
||||||
|
body: json
|
||||||
|
auth: none
|
||||||
|
}
|
||||||
|
|
||||||
|
headers {
|
||||||
|
Content-Type: application/json
|
||||||
|
}
|
||||||
|
|
||||||
|
body:json {
|
||||||
|
{
|
||||||
|
"model": "{{DEFAULT_MODEL}}",
|
||||||
|
"messages": [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "How could one use friction to cook an egg?"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"max_tokens": 256,
|
||||||
|
"temperature": 0.2,
|
||||||
|
"grammar": ""
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,29 @@
|
|||||||
|
meta {
|
||||||
|
name: chat-completions -long-
|
||||||
|
type: http
|
||||||
|
seq: 5
|
||||||
|
}
|
||||||
|
|
||||||
|
post {
|
||||||
|
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
|
||||||
|
body: json
|
||||||
|
auth: none
|
||||||
|
}
|
||||||
|
|
||||||
|
headers {
|
||||||
|
Content-Type: application/json
|
||||||
|
}
|
||||||
|
|
||||||
|
body:json {
|
||||||
|
{
|
||||||
|
"model": "{{DEFAULT_MODEL}}",
|
||||||
|
"messages": [{"role": "system", "content": "You are a helpful, intelligent pirate assistant. Always answer as helpfully as possible, while being a pirate and using pirate language. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."},
|
||||||
|
{"role": "user", "content": "How could one use electricity to cook an egg?"},
|
||||||
|
{"role": "assistant",
|
||||||
|
"content": "Shiver me timbers! Using electricity to cook an egg? Well, matey, I reckon that be a right curious idea! *adjusts spectacles* Now, I ain't sure if this be possible, but I'll do me best to help ye out.\n\nFirst things first, ye gotta understand that electricity be a powerful force, and it can't just cook an egg on its own. Ye see, electricity be like a mighty wind that can make things happen, but it needs somethin' to work with. So, if ye want to use electricity to cook an egg, ye gotta find a way to harness that power and make it do the cookin'.\n\nNow, I know what ye might be thinkin': \"How do I harness the power of electricity to cook an egg?\" Well, matey, there be a few ways to do it. One way be to use a special device called an \"electric frying pan.\" This be a pan that has a built-in heating element that gets hot when ye plug it into a wall socket. When the element gets hot, ye can crack an egg into the pan and watch as it cook"
|
||||||
|
},
|
||||||
|
{"role": "user", "content": "I don't have one of those, just a raw wire and plenty of power! How do we get it done?"}],
|
||||||
|
"max_tokens": 1024,
|
||||||
|
"temperature": 0.5
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,25 @@
|
|||||||
|
meta {
|
||||||
|
name: chat-completions -stream-
|
||||||
|
type: http
|
||||||
|
seq: 6
|
||||||
|
}
|
||||||
|
|
||||||
|
post {
|
||||||
|
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
|
||||||
|
body: json
|
||||||
|
auth: none
|
||||||
|
}
|
||||||
|
|
||||||
|
headers {
|
||||||
|
Content-Type: application/json
|
||||||
|
}
|
||||||
|
|
||||||
|
body:json {
|
||||||
|
{
|
||||||
|
"model": "{{DEFAULT_MODEL}}",
|
||||||
|
"messages": [{"role": "user", "content": "Explain how I can set sail on the ocean using only power generated by seagulls?"}],
|
||||||
|
"max_tokens": 256,
|
||||||
|
"temperature": 0.9,
|
||||||
|
"stream": true
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,22 @@
|
|||||||
|
meta {
|
||||||
|
name: add model gallery
|
||||||
|
type: http
|
||||||
|
seq: 10
|
||||||
|
}
|
||||||
|
|
||||||
|
post {
|
||||||
|
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
|
||||||
|
body: json
|
||||||
|
auth: none
|
||||||
|
}
|
||||||
|
|
||||||
|
headers {
|
||||||
|
Content-Type: application/json
|
||||||
|
}
|
||||||
|
|
||||||
|
body:json {
|
||||||
|
{
|
||||||
|
"url": "file:///home/dave/projects/model-gallery/huggingface/TheBloke__CodeLlama-7B-Instruct-GGML.yaml",
|
||||||
|
"name": "test"
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,21 @@
|
|||||||
|
meta {
|
||||||
|
name: delete model gallery
|
||||||
|
type: http
|
||||||
|
seq: 11
|
||||||
|
}
|
||||||
|
|
||||||
|
delete {
|
||||||
|
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
|
||||||
|
body: json
|
||||||
|
auth: none
|
||||||
|
}
|
||||||
|
|
||||||
|
headers {
|
||||||
|
Content-Type: application/json
|
||||||
|
}
|
||||||
|
|
||||||
|
body:json {
|
||||||
|
{
|
||||||
|
"name": "test"
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
meta {
|
||||||
|
name: list MODELS in galleries
|
||||||
|
type: http
|
||||||
|
seq: 7
|
||||||
|
}
|
||||||
|
|
||||||
|
get {
|
||||||
|
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/available
|
||||||
|
body: none
|
||||||
|
auth: none
|
||||||
|
}
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
meta {
|
||||||
|
name: list model GALLERIES
|
||||||
|
type: http
|
||||||
|
seq: 8
|
||||||
|
}
|
||||||
|
|
||||||
|
get {
|
||||||
|
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
|
||||||
|
body: none
|
||||||
|
auth: none
|
||||||
|
}
|
||||||
@@ -0,0 +1,21 @@
|
|||||||
|
meta {
|
||||||
|
name: model gallery apply -gist-
|
||||||
|
type: http
|
||||||
|
seq: 12
|
||||||
|
}
|
||||||
|
|
||||||
|
post {
|
||||||
|
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/apply
|
||||||
|
body: json
|
||||||
|
auth: none
|
||||||
|
}
|
||||||
|
|
||||||
|
headers {
|
||||||
|
Content-Type: application/json
|
||||||
|
}
|
||||||
|
|
||||||
|
body:json {
|
||||||
|
{
|
||||||
|
"id": "TheBloke__CodeLlama-7B-Instruct-GGML__codellama-7b-instruct.ggmlv3.Q2_K.bin"
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,22 @@
|
|||||||
|
meta {
|
||||||
|
name: model gallery apply
|
||||||
|
type: http
|
||||||
|
seq: 9
|
||||||
|
}
|
||||||
|
|
||||||
|
post {
|
||||||
|
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/apply
|
||||||
|
body: json
|
||||||
|
auth: none
|
||||||
|
}
|
||||||
|
|
||||||
|
headers {
|
||||||
|
Content-Type: application/json
|
||||||
|
}
|
||||||
|
|
||||||
|
body:json {
|
||||||
|
{
|
||||||
|
"id": "dave@TheBloke__CodeLlama-7B-Instruct-GGML__codellama-7b-instruct.ggmlv3.Q3_K_S.bin",
|
||||||
|
"name": "codellama7b"
|
||||||
|
}
|
||||||
|
}
|
||||||
22
examples/bruno/LocalAI Test Requests/tts/-tts.bru
Normal file
22
examples/bruno/LocalAI Test Requests/tts/-tts.bru
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
meta {
|
||||||
|
name: -tts
|
||||||
|
type: http
|
||||||
|
seq: 2
|
||||||
|
}
|
||||||
|
|
||||||
|
post {
|
||||||
|
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/tts
|
||||||
|
body: json
|
||||||
|
auth: none
|
||||||
|
}
|
||||||
|
|
||||||
|
headers {
|
||||||
|
Content-Type: application/json
|
||||||
|
}
|
||||||
|
|
||||||
|
body:json {
|
||||||
|
{
|
||||||
|
"model": "{{DEFAULT_MODEL}}",
|
||||||
|
"input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"
|
||||||
|
}
|
||||||
|
}
|
||||||
16
examples/chainlit/Dockerfile
Normal file
@@ -0,0 +1,16 @@
# Use an official Python runtime as a parent image
FROM harbor.home.sfxworks.net/docker/library/python:3.9-slim

# Set the working directory in the container
WORKDIR /app

# Copy the requirements file into the container at /app
COPY requirements.txt /app

# Install any needed packages specified in requirements.txt
RUN pip install -r requirements.txt

COPY . /app

# Run main.py with Chainlit when the container launches
CMD ["chainlit", "run", "-h", "--host", "0.0.0.0", "main.py" ]
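To try the Chainlit frontend container locally, a build-and-run sketch (the image tag and published port are arbitrary choices; Chainlit listens on port 8000 by default, and the app expects a LocalAI instance reachable at the `apiBase` set in `config.yaml`):

```bash
# Build the image from examples/chainlit and run it, exposing the Chainlit UI.
docker build -t chainlit-localai examples/chainlit
docker run --rm -p 8000:8000 chainlit-localai
```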
25
examples/chainlit/README.md
Normal file
@@ -0,0 +1,25 @@
# LocalAI Demonstration with Embeddings and Chainlit

This demonstration shows you how to use embeddings with existing data in `LocalAI`, and how to integrate it with Chainlit for an interactive querying experience. We are using the `llama_index` library to facilitate the embedding and querying processes, and `chainlit` to provide an interactive interface. The `Weaviate` client is used as the vector store.

## Prerequisites

Before proceeding, make sure you have the following installed:
- Weaviate client
- LocalAI and its dependencies
- Chainlit and its dependencies

## Getting Started

1. Clone this repository:
2. Navigate to the project directory:
3. Run the example: `chainlit run main.py`

# Highlight on `llama_index` and `chainlit`

`llama_index` is the key library that facilitates the process of embedding and querying data in LocalAI. It provides a seamless interface to integrate various components, such as `WeaviateVectorStore`, `LocalAI`, `ServiceContext`, and more, for a smooth querying experience.

`chainlit` is used to provide an interactive interface for users to query the data and see the results in real-time. It integrates with llama_index to handle the querying process and display the results to the user.

In this example, `llama_index` is used to set up the `VectorStoreIndex` and `QueryEngine`, and `chainlit` is used to handle the user interactions with `LocalAI` and display the results.
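The Getting Started steps above condense to roughly the following (a sketch, assuming Python 3.9+, a running Weaviate instance, and a LocalAI endpoint matching `config.yaml`):

```bash
# Install the example's dependencies and start the Chainlit app.
cd examples/chainlit
pip install -r requirements.txt
chainlit run main.py
```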
16
examples/chainlit/config.yaml
Normal file
@@ -0,0 +1,16 @@
localAI:
  temperature: 0
  modelName: gpt-3.5-turbo
  apiBase: http://local-ai.default
  apiKey: stub
  streaming: True
weviate:
  url: http://weviate.local
  index: AIChroma
query:
  mode: hybrid
  topK: 1
  alpha: 0.0
  chunkSize: 1024
embedding:
  model: BAAI/bge-small-en-v1.5
82
examples/chainlit/main.py
Normal file
@@ -0,0 +1,82 @@
import os

import weaviate
from llama_index.storage.storage_context import StorageContext
from llama_index.vector_stores import WeaviateVectorStore

from llama_index.query_engine.retriever_query_engine import RetrieverQueryEngine
from llama_index.callbacks.base import CallbackManager
from llama_index import (
    LLMPredictor,
    ServiceContext,
    StorageContext,
    VectorStoreIndex,
)
import chainlit as cl

from llama_index.llms import LocalAI
from llama_index.embeddings import HuggingFaceEmbedding
import yaml

# Load the configuration file
with open("config.yaml", "r") as ymlfile:
    cfg = yaml.safe_load(ymlfile)

# Get the values from the configuration file or set the default values
temperature = cfg['localAI'].get('temperature', 0)
model_name = cfg['localAI'].get('modelName', "gpt-3.5-turbo")
api_base = cfg['localAI'].get('apiBase', "http://local-ai.default")
api_key = cfg['localAI'].get('apiKey', "stub")
streaming = cfg['localAI'].get('streaming', True)
weaviate_url = cfg['weviate'].get('url', "http://weviate.default")
index_name = cfg['weviate'].get('index', "AIChroma")
query_mode = cfg['query'].get('mode', "hybrid")
topK = cfg['query'].get('topK', 1)
alpha = cfg['query'].get('alpha', 0.0)
embed_model_name = cfg['embedding'].get('model', "BAAI/bge-small-en-v1.5")
chunk_size = cfg['query'].get('chunkSize', 1024)


embed_model = HuggingFaceEmbedding(model_name=embed_model_name)


llm = LocalAI(temperature=temperature, model_name=model_name, api_base=api_base, api_key=api_key, streaming=streaming)
llm.globally_use_chat_completions = True
client = weaviate.Client(weaviate_url)
vector_store = WeaviateVectorStore(weaviate_client=client, index_name=index_name)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

@cl.on_chat_start
async def factory():

    llm_predictor = LLMPredictor(
        llm=llm
    )

    service_context = ServiceContext.from_defaults(embed_model=embed_model, callback_manager=CallbackManager([cl.LlamaIndexCallbackHandler()]), llm_predictor=llm_predictor, chunk_size=chunk_size)

    index = VectorStoreIndex.from_vector_store(
        vector_store,
        storage_context=storage_context,
        service_context=service_context
    )

    query_engine = index.as_query_engine(vector_store_query_mode=query_mode, similarity_top_k=topK, alpha=alpha, streaming=True)

    cl.user_session.set("query_engine", query_engine)


@cl.on_message
async def main(message: cl.Message):
    query_engine = cl.user_session.get("query_engine")
    response = await cl.make_async(query_engine.query)(message.content)

    response_message = cl.Message(content="")

    for token in response.response_gen:
        await response_message.stream_token(token=token)

    if response.response_txt:
        response_message.content = response.response_txt

    await response_message.send()
7
examples/chainlit/requirements.txt
Normal file
@@ -0,0 +1,7 @@
llama_hub==0.0.41
llama_index==0.8.55
Requests==2.31.0
weaviate_client==3.25.1
transformers
torch
chainlit
@@ -24,10 +24,13 @@ docker-compose up -d --pull always
 # docker-compose up -d --build
 ```
 
+Then browse to `http://localhost:3000` to view the Web UI.
+
 ## Pointing chatbot-ui to a separately managed LocalAI service
 
-If you want to use the [chatbot-ui example](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui) with an externally managed LocalAI service, you can alter the `docker-compose` file so that it looks like the below. You will notice the file is smaller, because we have removed the section that would normally start the LocalAI service. Take care to update the IP address (or FQDN) that the chatbot-ui service tries to access (marked `<<LOCALAI_IP>>` below):
-```
+If you want to use the [chatbot-ui example](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui) with an externally managed LocalAI service, you can alter the `docker-compose.yaml` file so that it looks like the below. You will notice the file is smaller, because we have removed the section that would normally start the LocalAI service. Take care to update the IP address (or FQDN) that the chatbot-ui service tries to access (marked `<<LOCALAI_IP>>` below):
+
+```yaml
 version: '3.6'
 
 services:
@@ -40,9 +43,8 @@ services:
       - 'OPENAI_API_HOST=http://<<LOCALAI_IP>>:8080'
 ```
 
-Once you've edited the Dockerfile, you can start it with `docker compose up`, then browse to `http://localhost:3000`.
+Once you've edited the `docker-compose.yaml`, you can start it with `docker compose up`, then browse to `http://localhost:3000` to view the Web UI.
 
 ## Accessing chatbot-ui
 
 Open http://localhost:3000 for the Web UI.
-
1
examples/chatbot-ui-manual/models
Symbolic link
@@ -0,0 +1 @@
../models
@@ -20,10 +20,13 @@ docker-compose up --pull always
 # docker-compose up -d --build
 ```
 
+Then browse to `http://localhost:3000` to view the Web UI.
+
 ## Pointing chatbot-ui to a separately managed LocalAI service
 
-If you want to use the [chatbot-ui example](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui) with an externally managed LocalAI service, you can alter the `docker-compose` file so that it looks like the below. You will notice the file is smaller, because we have removed the section that would normally start the LocalAI service. Take care to update the IP address (or FQDN) that the chatbot-ui service tries to access (marked `<<LOCALAI_IP>>` below):
-```
+If you want to use the [chatbot-ui example](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui) with an externally managed LocalAI service, you can alter the `docker-compose.yaml` file so that it looks like the below. You will notice the file is smaller, because we have removed the section that would normally start the LocalAI service. Take care to update the IP address (or FQDN) that the chatbot-ui service tries to access (marked `<<LOCALAI_IP>>` below):
+
+```yaml
 version: '3.6'
 
 services:
@@ -36,9 +39,8 @@ services:
       - 'OPENAI_API_HOST=http://<<LOCALAI_IP>>:8080'
 ```
 
-Once you've edited the Dockerfile, you can start it with `docker compose up`, then browse to `http://localhost:3000`.
+Once you've edited the `docker-compose.yaml`, you can start it with `docker compose up`, then browse to `http://localhost:3000` to view the Web UI.
 
 ## Accessing chatbot-ui
 
 Open http://localhost:3000 for the Web UI.
-
42
examples/configurations/README.md
Normal file
@@ -0,0 +1,42 @@
## Advanced configuration

This section contains examples of how to install models manually with config files.

### Prerequisites

First clone LocalAI:

```bash
git clone https://github.com/go-skynet/LocalAI

cd LocalAI
```

Set up the model you prefer from the examples below and then start LocalAI:

```bash
docker compose up -d --pull always
```

If LocalAI is already started, you can restart it with:

```bash
docker compose restart
```

See also the getting started guide: https://localai.io/basics/getting_started/

### Mistral

To set up Mistral, copy the files inside `mistral` into the `models` folder:

```bash
cp -r examples/configurations/mistral/* models/
```

Now download the model:

```bash
wget https://huggingface.co/TheBloke/Mistral-7B-OpenOrca-GGUF/resolve/main/mistral-7b-openorca.Q6_K.gguf -O models/mistral-7b-openorca.Q6_K.gguf
```
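Once the Mistral configuration is in place and LocalAI has restarted, a quick smoke test might look like the following (a sketch: the model name below is an assumption, use whatever `name:` the copied YAML registers):

```bash
# Ask the freshly installed model a question via the OpenAI-compatible endpoint.
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
  "model": "mistral-7b-openorca",
  "messages": [{"role": "user", "content": "Say hello in one short sentence."}],
  "temperature": 0.1
}'
```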
18
examples/configurations/llava/README.md
Normal file
@@ -0,0 +1,18 @@


## Setup

```
mkdir models
wget https://huggingface.co/mys/ggml_bakllava-1/resolve/main/ggml-model-q4_k.gguf -O models/ggml-model-q4_k.gguf
wget https://huggingface.co/mys/ggml_bakllava-1/resolve/main/mmproj-model-f16.gguf -O models/mmproj-model-f16.gguf
docker run -p 8080:8080 -v $PWD/models:/models -ti --rm quay.io/go-skynet/local-ai:master --models-path /models --threads 4
```

## Try it out

```
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "llava",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
```
3
examples/configurations/llava/chat-simple.tmpl
Normal file
@@ -0,0 +1,3 @@
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
ASSISTANT:
20
examples/configurations/llava/llava.yaml
Normal file
@@ -0,0 +1,20 @@

context_size: 4096
f16: true
threads: 11
gpu_layers: 90
name: llava
mmap: true
backend: llama-cpp
roles:
  user: "USER:"
  assistant: "ASSISTANT:"
  system: "SYSTEM:"
parameters:
  model: ggml-model-q4_k.gguf
  temperature: 0.2
  top_k: 40
  top_p: 0.95
template:
  chat: chat-simple
mmproj: mmproj-model-f16.gguf
3
examples/configurations/mistral/chatml-block.tmpl
Normal file
@@ -0,0 +1,3 @@
{{.Input}}
<|im_start|>assistant

3
examples/configurations/mistral/chatml.tmpl
Normal file
@@ -0,0 +1,3 @@
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
{{if .Content}}{{.Content}}{{end}}
<|im_end|>
Some files were not shown because too many files have changed in this diff.