fixup: create piper libdir also when not built

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Merge branch 'master' into enable_gpu
2026-05-24 08:38:02 -04:00 · 2023-11-12 22:17:11 +01:00 · 2023-11-11 19:20:36 +01:00 · 2023-11-11 18:40:48 +01:00 · 2023-11-11 18:40:26 +01:00 · 2023-11-11 13:14:59 +01:00
165 changed files with 30714 additions and 1132 deletions
--- a/.env
+++ b/.env
@@ -67,3 +67,6 @@ MODELS_PATH=/models
 ### Default number of workers for GRPC Python backends.
 ### This actually controls whether a backend can process multiple requests or not.
 # PYTHON_GRPC_MAX_WORKERS=1
 ### Define the number of parallel LLAMA.cpp workers (Defaults to 1)
 # LLAMACPP_PARALLEL=1
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -12,6 +12,9 @@ jobs:
          - repository: "go-skynet/go-llama.cpp"
            variable: "GOLLAMA_VERSION"
            branch: "master"
          - repository: "ggerganov/llama.cpp"
            variable: "CPPLLAMA_VERSION"
            branch: "master"
          - repository: "go-skynet/go-ggml-transformers.cpp"
            variable: "GOGGMLTRANSFORMERS_VERSION"
            branch: "master"
@@ -41,7 +44,7 @@ jobs:
            branch: "master"
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
      - name: Bump dependencies 🔧
        run: |
          bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }}
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -14,15 +14,21 @@ concurrency:
  cancel-in-progress: true
 jobs:
-  docker:
+  image-build:
    strategy:
      matrix:
        include:
          - build-type: ''
-            platforms: 'linux/amd64,linux/arm64'
+            #platforms: 'linux/amd64,linux/arm64'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: ''
            ffmpeg: ''
          - build-type: ''
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-ffmpeg'
            ffmpeg: 'true'
          - build-type: 'cublas'
            cuda-major-version: 11
            cuda-minor-version: 7
@@ -37,11 +43,6 @@ jobs:
            tag-latest: 'false'
            tag-suffix: '-cublas-cuda12'
            ffmpeg: ''
          - build-type: ''
            platforms: 'linux/amd64,linux/arm64'
            tag-latest: 'false'
            tag-suffix: '-ffmpeg'
            ffmpeg: 'true'
          - build-type: 'cublas'
            cuda-major-version: 11
            cuda-minor-version: 7
@@ -57,43 +58,54 @@ jobs:
            tag-suffix: '-cublas-cuda12-ffmpeg'
            ffmpeg: 'true'
-    runs-on: ubuntu-latest
+    runs-on: arc-runner-set 
    steps:
-      - name: Release space from worker
+      - name: Force Install GIT latest
        run: |
-          echo "Listing top largest packages"
+          sudo apt-get update \
-          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          && sudo apt-get install -y software-properties-common \
-          head -n 30 <<< "${pkgs}"
+          && sudo apt-get update \
-          echo
+          && sudo add-apt-repository -y ppa:git-core/ppa \
-          df -h
+          && sudo apt-get update \
-          echo
+          && sudo apt-get install -y git
          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
          sudo rm -rf /usr/local/lib/android
          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
          sudo rm -rf /usr/share/dotnet
          sudo apt-get remove -y '^mono-.*' || true
          sudo apt-get remove -y '^ghc-.*' || true
          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
          sudo apt-get remove -y 'php.*' || true
          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
          sudo apt-get remove -y '^google-.*' || true
          sudo apt-get remove -y azure-cli || true
          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
          sudo apt-get remove -y '^gfortran-.*' || true
          sudo apt-get autoremove -y
          sudo apt-get clean
          echo
          echo "Listing top largest packages"
          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
          head -n 30 <<< "${pkgs}"
          echo
          sudo rm -rfv build || true
          df -h
      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
-
+      # - name: Release space from worker
      #   run: |
      #     echo "Listing top largest packages"
      #     pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
      #     head -n 30 <<< "${pkgs}"
      #     echo
      #     df -h
      #     echo
      #     sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
      #     sudo apt-get remove --auto-remove android-sdk-platform-tools || true
      #     sudo apt-get purge --auto-remove android-sdk-platform-tools || true
      #     sudo rm -rf /usr/local/lib/android
      #     sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
      #     sudo rm -rf /usr/share/dotnet
      #     sudo apt-get remove -y '^mono-.*' || true
      #     sudo apt-get remove -y '^ghc-.*' || true
      #     sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
      #     sudo apt-get remove -y 'php.*' || true
      #     sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
      #     sudo apt-get remove -y '^google-.*' || true
      #     sudo apt-get remove -y azure-cli || true
      #     sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
      #     sudo apt-get remove -y '^gfortran-.*' || true
      #     sudo apt-get remove -y microsoft-edge-stable || true
      #     sudo apt-get remove -y firefox || true
      #     sudo apt-get remove -y powershell || true
      #     sudo apt-get remove -y r-base-core || true
      #     sudo apt-get autoremove -y
      #     sudo apt-get clean
      #     echo
      #     echo "Listing top largest packages"
      #     pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
      #     head -n 30 <<< "${pkgs}"
      #     echo
      #     sudo rm -rfv build || true
      #     df -h
      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -19,7 +19,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          submodules: true
      - uses: actions/setup-go@v4
@@ -29,6 +29,12 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
              cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
                -DgRPC_BUILD_TESTS=OFF \
                ../.. && sudo make -j12 install
      - name: Build
        id: build
        env:
@@ -60,18 +66,26 @@ jobs:
    runs-on: macOS-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          submodules: true
      - uses: actions/setup-go@v4
        with:
          go-version: '>=1.21.0'
      - name: Dependencies
        run: |
          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
              cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
                -DgRPC_BUILD_TESTS=OFF \
                ../.. && make -j12 install && rm -rf grpc
      - name: Build
        id: build
        env:
          CMAKE_ARGS: "${{ matrix.defines }}"
          BUILD_ID: "${{ matrix.build }}"
        run: |
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
          make dist
      - uses: actions/upload-artifact@v3
        with:
--- a/.github/workflows/test-gpu.yml
+++ b/.github/workflows/test-gpu.yml
@@ -0,0 +1,63 @@
 ---
 name: 'GPU tests'
 on:
  pull_request:
  push:
    branches:
      - master
    tags:
      - '*'
 concurrency:
  group: ci-gpu-tests-${{ github.head_ref || github.ref }}-${{ github.repository }}
  cancel-in-progress: true
 jobs:
  ubuntu-latest:
    runs-on: gpu
    strategy:
      matrix:
        go-version: ['1.21.x']
    steps:
      - name: Clone
        uses: actions/checkout@v4
        with: 
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
        uses: actions/setup-go@v4
        with:
          go-version: ${{ matrix.go-version }}
      # You can test your matrix by printing the current Go version
      - name: Display Go version
        run: go version
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo DEBIAN_FRONTEND=noninteractive apt-get install -y make wget
      - name: Build
        run: |
          if [ ! -e /run/systemd/system ]; then
            sudo mkdir /run/systemd/system
          fi
          sudo mkdir -p /host/tests/${{ github.head_ref || github.ref }}
          sudo chmod -R 777 /host/tests/${{ github.head_ref || github.ref }}
          make \
            TEST_DIR="/host/tests/${{ github.head_ref || github.ref }}" \
            BUILD_TYPE=cublas \
            prepare-e2e run-e2e-image test-e2e
      # Best-effort cleanup that runs even when earlier steps failed (if: always()).
      # Each command is allowed to fail so that one failure does not skip the rest.
      - name: Release space from worker ♻
        if: always()
        run: |
          sudo rm -rf build || true
          sudo rm -rf bin || true
          sudo rm -rf dist || true
          # `docker logs` fails when no (or more than one) container matches the filter;
          # tolerate that so the teardown and prune below still run.
          sudo docker logs $(sudo docker ps -q --filter ancestor=localai-tests) > logs.txt || true
          sudo cat logs.txt || true
          sudo rm -rf logs.txt
          make clean || true
          make \
            TEST_DIR="/host/tests/${{ github.head_ref || github.ref }}" \
            teardown-e2e || true
          sudo rm -rf /host/tests/${{ github.head_ref || github.ref }} || true
          docker system prune -f -a --volumes || true
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -14,14 +14,46 @@ concurrency:
  cancel-in-progress: true
 jobs:
-  ubuntu-latest:
+  tests-linux:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        go-version: ['1.21.x']
    steps:
      - name: Release space from worker
        run: |
          echo "Listing top largest packages"
          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
          head -n 30 <<< "${pkgs}"
          echo
          df -h
          echo
          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
          sudo rm -rf /usr/local/lib/android
          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
          sudo rm -rf /usr/share/dotnet
          sudo apt-get remove -y '^mono-.*' || true
          sudo apt-get remove -y '^ghc-.*' || true
          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
          sudo apt-get remove -y 'php.*' || true
          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
          sudo apt-get remove -y '^google-.*' || true
          sudo apt-get remove -y azure-cli || true
          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
          sudo apt-get remove -y '^gfortran-.*' || true
          sudo apt-get autoremove -y
          sudo apt-get clean
          echo
          echo "Listing top largest packages"
          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
          head -n 30 <<< "${pkgs}"
          echo
          sudo rm -rfv build || true
          df -h
      - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with: 
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
@@ -35,38 +67,43 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
-          
+          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
             sudo apt-get update && \
             sudo apt-get install -y conda
          sudo apt-get install -y ca-certificates cmake curl patch
          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
          sudo pip install -r extra/requirements.txt
-          sudo mkdir /build && sudo chmod -R 777 /build && cd /build && \
+          sudo rm -rfv /usr/bin/conda || true
-          curl -L "https://github.com/gabime/spdlog/archive/refs/tags/v1.11.0.tar.gz" | \
+          PATH=$PATH:/opt/conda/bin make -C extra/grpc/huggingface
-          tar -xzvf - && \
+
-          mkdir -p "spdlog-1.11.0/build" && \
+          # Pre-build piper before we start tests in order to have shared libraries in place
-          cd "spdlog-1.11.0/build" && \
+          make go-piper && \
-          cmake ..  && \
+          GO_TAGS="tts" make -C go-piper piper.o && \
-          make -j8 && \
+          sudo cp -rfv go-piper/piper/build/pi/lib/. /usr/lib/ && \
-          sudo cmake --install . --prefix /usr && mkdir -p "lib/Linux-$(uname -m)" && \
+
-          cd /build && \
+          # Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn)
-          mkdir -p "lib/Linux-$(uname -m)/piper_phonemize" && \
+          GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
-          curl -L "https://github.com/rhasspy/piper-phonemize/releases/download/v1.0.0/libpiper_phonemize-amd64.tar.gz" | \
+
-          tar -C "lib/Linux-$(uname -m)/piper_phonemize" -xzvf - && ls -liah /build/lib/Linux-$(uname -m)/piper_phonemize/ && \
+          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
-          sudo cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /usr/lib/ && \
+              cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
-          sudo ln -s /usr/lib/libpiper_phonemize.so /usr/lib/libpiper_phonemize.so.1 && \
+                -DgRPC_BUILD_TESTS=OFF \
-          sudo cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/include/. /usr/include/
+                ../.. && sudo make -j12 install
      - name: Test
        run: |
-          ESPEAK_DATA="/build/lib/Linux-$(uname -m)/piper_phonemize/lib/espeak-ng-data" GO_TAGS="tts stablediffusion" make test
+          GO_TAGS="stablediffusion tts" make test
-  macOS-latest:
+  tests-apple:
    runs-on: macOS-latest
    strategy:
      matrix:
        go-version: ['1.21.x']
    steps:
      - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with: 
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
@@ -76,6 +113,14 @@ jobs:
      # You can test your matrix by printing the current Go version
      - name: Display Go version
        run: go version
      - name: Dependencies
        run: |
          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
              cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
                -DgRPC_BUILD_TESTS=OFF \
                ../.. && make -j12 install && rm -rf grpc
      - name: Test
        run: |
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
          CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make test
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,8 @@ go-ggllm
 __pycache__/
 *.a
 get-sources
 /backend/cpp/llama/grpc-server
 /backend/cpp/llama/llama.cpp
 go-ggml-transformers
 go-gpt2
--- a/159
+++ b/159
@@ -1,22 +1,27 @@
 ARG GO_VERSION=1.21-bullseye
 ARG IMAGE_TYPE=extras
 # extras or core
-FROM golang:$GO_VERSION as requirements
+
 FROM golang:$GO_VERSION as requirements-core
 ARG BUILD_TYPE
 ARG CUDA_MAJOR_VERSION=11
 ARG CUDA_MINOR_VERSION=7
 ARG SPDLOG_VERSION="1.11.0"
 ARG PIPER_PHONEMIZE_VERSION='1.0.0'
 ARG TARGETARCH
 ARG TARGETVARIANT
 ENV BUILD_TYPE=${BUILD_TYPE}
-ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/extra/grpc/huggingface/huggingface.py,autogptq:/build/extra/grpc/autogptq/autogptq.py,bark:/build/extra/grpc/bark/ttsbark.py,diffusers:/build/extra/grpc/diffusers/backend_diffusers.py,exllama:/build/extra/grpc/exllama/exllama.py,vall-e-x:/build/extra/grpc/vall-e-x/ttsvalle.py,vllm:/build/extra/grpc/vllm/backend_vllm.py"
+ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/extra/grpc/huggingface/run.sh,autogptq:/build/extra/grpc/autogptq/run.sh,bark:/build/extra/grpc/bark/run.sh,diffusers:/build/extra/grpc/diffusers/run.sh,exllama:/build/extra/grpc/exllama/run.sh,vall-e-x:/build/extra/grpc/vall-e-x/run.sh,vllm:/build/extra/grpc/vllm/run.sh"
 ENV GALLERIES='[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}, {"url": "github:go-skynet/model-gallery/huggingface.yaml","name":"huggingface"}]'
 ARG GO_TAGS="stablediffusion tts"
 RUN apt-get update && \
-    apt-get install -y ca-certificates cmake curl patch pip
+    apt-get install -y ca-certificates curl patch pip cmake && apt-get clean
 COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
 RUN update-ca-certificates
 # Use the variables in subsequent instructions
 RUN echo "Target Architecture: $TARGETARCH"
@@ -30,66 +35,62 @@ RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
    dpkg -i cuda-keyring_1.0-1_all.deb && \
    rm -f cuda-keyring_1.0-1_all.deb && \
    apt-get update && \
-    apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+    apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}  && apt-get clean \
    ; fi
 ENV PATH /usr/local/cuda/bin:${PATH}
 # OpenBLAS requirements and stable diffusion
 RUN apt-get install -y \
    libopenblas-dev \
    libopencv-dev \ 
    && apt-get clean
 # Set up OpenCV
 RUN ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
 WORKDIR /build
 RUN test -n "$TARGETARCH" \
    || (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')
 # Extras requirements
 FROM requirements-core as requirements-extras
 # Install conda from the Anaconda apt repository:
 #  1. fetch the signing key and store it as a dedicated keyring,
 #  2. list the expected key fingerprint from that keyring (fails if it is absent),
 #  3. register the repository exactly once (a second, appended copy of the same
 #     entry only makes apt warn about a target configured multiple times),
 #  4. refresh the package index and install conda.
 RUN curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
    install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
    gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list && \
    apt-get update && \
    apt-get install -y conda
 COPY extra/requirements.txt /build/extra/requirements.txt
 ENV PATH="/root/.cargo/bin:${PATH}"
 RUN pip install --upgrade pip
 RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
-RUN if [ "${TARGETARCH}" = "amd64" ]; then \
+#RUN if [ "${TARGETARCH}" = "amd64" ]; then \
-        pip install git+https://github.com/suno-ai/bark.git diffusers invisible_watermark transformers accelerate safetensors;\
+#        pip install git+https://github.com/suno-ai/bark.git diffusers invisible_watermark transformers accelerate safetensors;\
-    fi
+#    fi
-RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "amd64" ]; then \
+#RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "amd64" ]; then \
-        pip install torch vllm && pip install auto-gptq https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION}-cp39-cp39-linux_x86_64.whl;\
+#        pip install torch vllm && pip install auto-gptq https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION}-cp39-cp39-linux_x86_64.whl;\
-    fi
+ #   fi
-RUN pip install -r /build/extra/requirements.txt && rm -rf /build/extra/requirements.txt
+#RUN pip install -r /build/extra/requirements.txt && rm -rf /build/extra/requirements.txt
 # Vall-e-X
 RUN git clone https://github.com/Plachtaa/VALL-E-X.git /usr/lib/vall-e-x && cd /usr/lib/vall-e-x && pip install -r requirements.txt
 WORKDIR /build
 # OpenBLAS requirements
 RUN apt-get install -y libopenblas-dev
 # Stable Diffusion requirements
 RUN apt-get install -y libopencv-dev && \
    ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
 # piper requirements
 # Use pre-compiled Piper phonemization library (includes onnxruntime)
 #RUN if echo "${GO_TAGS}" | grep -q "tts"; then \
 RUN test -n "$TARGETARCH" \
    || (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')
 RUN curl -L "https://github.com/gabime/spdlog/archive/refs/tags/v${SPDLOG_VERSION}.tar.gz" | \
    tar -xzvf - && \
    mkdir -p "spdlog-${SPDLOG_VERSION}/build" && \
    cd "spdlog-${SPDLOG_VERSION}/build" && \
    cmake ..  && \
    make -j8 && \
    cmake --install . --prefix /usr && mkdir -p "lib/Linux-$(uname -m)" && \
    cd /build && \
    mkdir -p "lib/Linux-$(uname -m)/piper_phonemize" && \
    curl -L "https://github.com/rhasspy/piper-phonemize/releases/download/v${PIPER_PHONEMIZE_VERSION}/libpiper_phonemize-${TARGETARCH:-$(go env GOARCH)}${TARGETVARIANT}.tar.gz" | \
    tar -C "lib/Linux-$(uname -m)/piper_phonemize" -xzvf - && ls -liah /build/lib/Linux-$(uname -m)/piper_phonemize/ && \
    cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /usr/lib/ && \
    ln -s /usr/lib/libpiper_phonemize.so /usr/lib/libpiper_phonemize.so.1 && \
    cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/include/. /usr/include/
 # \
 #    ; fi
 ###################################
 ###################################
-FROM requirements as builder
+FROM requirements-${IMAGE_TYPE} as builder
 ARG GO_TAGS="stablediffusion tts"
-
+ARG GRPC_BACKENDS
 ARG BUILD_GRPC=true
 ENV GRPC_BACKENDS=${GRPC_BACKENDS}
 ENV GO_TAGS=${GO_TAGS}
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
@@ -104,21 +105,43 @@ RUN make prepare
 COPY . .
 COPY .git .
-RUN ESPEAK_DATA=/build/lib/Linux-$(uname -m)/piper_phonemize/lib/espeak-ng-data make build
+# stablediffusion does not tolerate a newer version of abseil, build it first
 RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
 RUN if [ "${BUILD_GRPC}" = "true" ]; then \
    git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
    cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
      -DgRPC_BUILD_TESTS=OFF \
       ../.. && make -j12 install && rm -rf grpc \
    ; fi
 # Rebuild with defaults backends
 RUN make build
 # When piper was not built, the lib directory does not exist. Create it with a
 # placeholder file so the final stage's
 # `COPY --from=builder /build/go-piper/piper/build/pi/lib/* /usr/lib/` has something to match.
 # The `&&` is required: without it `touch` and the keep path become extra
 # arguments to `mkdir -p` and are created as directories instead.
 RUN if [ ! -d "/build/go-piper/piper/build/pi/lib/" ]; then \
    mkdir -p /build/go-piper/piper/build/pi/lib/ && \
    touch /build/go-piper/piper/build/pi/lib/keep \
    ; fi
 ###################################
 ###################################
-FROM requirements
+FROM requirements-${IMAGE_TYPE}
 ARG FFMPEG
 ARG BUILD_TYPE
 ARG TARGETARCH
 ARG IMAGE_TYPE=extras
 ENV BUILD_TYPE=${BUILD_TYPE}
 ENV REBUILD=false
 ENV HEALTHCHECK_ENDPOINT=http://localhost:8080/readyz
 ARG CUDA_MAJOR_VERSION=11
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
 ENV NVIDIA_VISIBLE_DEVICES=all
 # Add FFmpeg
 RUN if [ "${FFMPEG}" = "true" ]; then \
    apt-get install -y ffmpeg \
@@ -132,15 +155,49 @@ WORKDIR /build
 # https://github.com/go-skynet/LocalAI/pull/434
 COPY . .
 RUN make prepare-sources
 # Copy the binary
 COPY --from=builder /build/local-ai ./
-# Copy VALLE-X as it's not a real "lib"
+# Copy shared libraries for piper
-RUN cp -rfv /usr/lib/vall-e-x/* ./
+COPY --from=builder /build/go-piper/piper/build/pi/lib/* /usr/lib/
-# To resolve exllama import error
+# do not let stablediffusion rebuild (requires an older version of absl)
-RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH:-$(go env GOARCH)}" = "amd64" ]; then \
+COPY --from=builder /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion
 ## Duplicated from Makefile to avoid having a big layer that's hard to push
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
 	PATH=$PATH:/opt/conda/bin make -C extra/grpc/autogptq \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
 	PATH=$PATH:/opt/conda/bin make -C extra/grpc/bark \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
 	PATH=$PATH:/opt/conda/bin make -C extra/grpc/diffusers \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
 	PATH=$PATH:/opt/conda/bin make -C extra/grpc/vllm \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
 	PATH=$PATH:/opt/conda/bin make -C extra/grpc/huggingface \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
 	PATH=$PATH:/opt/conda/bin make -C extra/grpc/vall-e-x \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
 	PATH=$PATH:/opt/conda/bin make -C extra/grpc/exllama \
    ; fi
 # Copy VALLE-X as it's not a real "lib"
 RUN if [ -d /usr/lib/vall-e-x ]; then \
    cp -rfv /usr/lib/vall-e-x/* ./ ; \ 
    fi
 # we also copy exllama libs over to resolve exllama import error
 RUN if [ -d /usr/local/lib/python3.9/dist-packages/exllama ]; then \
        cp -rfv /usr/local/lib/python3.9/dist-packages/exllama extra/grpc/exllama/;\
    fi
 # Define the health check command
 HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
  CMD curl -f $HEALTHCHECK_ENDPOINT || exit 1
--- a/179
+++ b/179
@@ -4,10 +4,12 @@ GOVET=$(GOCMD) vet
 BINARY_NAME=local-ai
 # llama.cpp versions
-GOLLAMA_VERSION?=d9f6176409de0a2b5ce798de502545c6721e346e
+GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0
 GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
 CPPLLAMA_VERSION?=a75fa576abba9d37f463580c379e4bbf1e1ad03c
 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
 GPT4ALL_VERSION?=27a8b020c36b0df8f8b82a252d261cda47cf44b8
@@ -26,24 +28,23 @@ WHISPER_CPP_VERSION?=85ed71aaec8e0612a84c0b67804bde75aa75a273
 BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d
 # go-piper version
-PIPER_VERSION?=56b8a81b4760a6fbee1a82e62f007ae7e8f010a7
+PIPER_VERSION?=736f6fb639ab8e3397356e48eeb6bdcb9da88a78
 # go-bloomz version
 BLOOMZ_VERSION?=1834e77b83faafe912ad4092ccf7f77937349e2f
 # stablediffusion version
 STABLEDIFFUSION_VERSION?=d89260f598afb809279bc72aa0107b4292587632
 # Go-ggllm
 GOGGLLM_VERSION?=862477d16eefb0805261c19c9b0d053e3b2b684b
 export BUILD_TYPE?=
 export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
 export CMAKE_ARGS?=
 CGO_LDFLAGS?=
 CUDA_LIBPATH?=/usr/local/cuda/lib64/
 GO_TAGS?=
 BUILD_ID?=git
 TEST_DIR=/tmp/test
 RANDOM := $(shell bash -c 'echo $$RANDOM')
 VERSION?=$(shell git describe --always --tags || echo "dev" )
 # go tool nm ./local-ai | grep Commit
 LD_FLAGS?=
@@ -51,7 +52,6 @@ override LD_FLAGS += -X "github.com/go-skynet/LocalAI/internal.Version=$(VERSION
 override LD_FLAGS += -X "github.com/go-skynet/LocalAI/internal.Commit=$(shell git rev-parse HEAD)"
 OPTIONAL_TARGETS?=
 ESPEAK_DATA?=
 OS := $(shell uname -s)
 ARCH := $(shell uname -m)
@@ -61,6 +61,9 @@ WHITE  := $(shell tput -Txterm setaf 7)
 CYAN   := $(shell tput -Txterm setaf 6)
 RESET  := $(shell tput -Txterm sgr0)
 # Default Docker bridge IP
 E2E_BRIDGE_IP?=172.17.0.1
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
 endif
@@ -82,6 +85,18 @@ ifeq ($(BUILD_TYPE),cublas)
 	export LLAMA_CUBLAS=1
 endif
 ifeq ($(BUILD_TYPE),hipblas)
 	ROCM_HOME ?= /opt/rocm
 	export CXX=$(ROCM_HOME)/llvm/bin/clang++
 	export CC=$(ROCM_HOME)/llvm/bin/clang
 	# Llama-stable has no hipblas support, so override it here.
 	export STABLE_BUILD_TYPE=
 	GPU_TARGETS ?= gfx900,gfx90a,gfx1030,gfx1031,gfx1100
 	AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
 	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
 	CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link
 endif
 ifeq ($(BUILD_TYPE),metal)
 	CGO_LDFLAGS+=-framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
 	export LLAMA_METAL=1
@@ -104,10 +119,18 @@ endif
 ifeq ($(findstring tts,$(GO_TAGS)),tts)
 #	OPTIONAL_TARGETS+=go-piper/libpiper_binding.a
 #	OPTIONAL_TARGETS+=backend-assets/espeak-ng-data
 	PIPER_CGO_CXXFLAGS+=-I$(shell pwd)/go-piper/piper/src/cpp -I$(shell pwd)/go-piper/piper/build/fi/include -I$(shell pwd)/go-piper/piper/build/pi/include -I$(shell pwd)/go-piper/piper/build/si/include
 	PIPER_CGO_LDFLAGS+=-L$(shell pwd)/go-piper/piper/build/fi/lib -L$(shell pwd)/go-piper/piper/build/pi/lib -L$(shell pwd)/go-piper/piper/build/si/lib -lfmt -lspdlog
 	OPTIONAL_GRPC+=backend-assets/grpc/piper
 endif
-GRPC_BACKENDS?=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
+ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/llama backend-assets/grpc/llama-cpp backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
 GRPC_BACKENDS?=$(ALL_GRPC_BACKENDS) $(OPTIONAL_GRPC)
 # If empty, then we build all
 ifeq ($(GRPC_BACKENDS),)
 	GRPC_BACKENDS=$(ALL_GRPC_BACKENDS)
 endif
 .PHONY: all test build vendor
@@ -118,14 +141,6 @@ gpt4all:
 	git clone --recurse-submodules $(GPT4ALL_REPO) gpt4all
 	cd gpt4all && git checkout -b build $(GPT4ALL_VERSION) && git submodule update --init --recursive --depth 1
 ## go-ggllm
 go-ggllm:
 	git clone --recurse-submodules https://github.com/mudler/go-ggllm.cpp go-ggllm
 	cd go-ggllm && git checkout -b build $(GOGGLLM_VERSION) && git submodule update --init --recursive --depth 1
 go-ggllm/libggllm.a: go-ggllm
 	$(MAKE) -C go-ggllm BUILD_TYPE=$(BUILD_TYPE) libggllm.a
 ## go-piper
 go-piper:
 	git clone --recurse-submodules https://github.com/mudler/go-piper go-piper
@@ -152,14 +167,6 @@ go-rwkv:
 go-rwkv/librwkv.a: go-rwkv
 	cd go-rwkv && cd rwkv.cpp &&	cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF &&	cmake --build . && 	cp librwkv.a ..
 ## bloomz
 bloomz:
 	git clone --recurse-submodules https://github.com/go-skynet/bloomz.cpp bloomz
 	cd bloomz && git checkout -b build $(BLOOMZ_VERSION) && git submodule update --init --recursive --depth 1
 bloomz/libbloomz.a: bloomz
 	cd bloomz && make libbloomz.a
 go-bert/libgobert.a: go-bert
 	$(MAKE) -C go-bert libgobert.a
@@ -169,14 +176,10 @@ backend-assets/gpt4all: gpt4all/gpt4all-bindings/golang/libgpt4all.a
 	@cp gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
 	@cp gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true
-backend-assets/espeak-ng-data:
+backend-assets/espeak-ng-data: go-piper
 	mkdir -p backend-assets/espeak-ng-data
-ifdef ESPEAK_DATA
+	$(MAKE) -C go-piper piper.o
-	@cp -rf $(ESPEAK_DATA)/. backend-assets/espeak-ng-data
+	@cp -rf go-piper/piper/build/pi/share/espeak-ng-data/. backend-assets/espeak-ng-data
 else
 	@echo "ESPEAK_DATA not set, skipping tts. Note that this will break the tts functionality."
 	@touch backend-assets/espeak-ng-data/keep
 endif
 gpt4all/gpt4all-bindings/golang/libgpt4all.a: gpt4all
 	$(MAKE) -C gpt4all/gpt4all-bindings/golang/ libgpt4all.a
@@ -208,12 +211,12 @@ go-llama/libbinding.a: go-llama
 	$(MAKE) -C go-llama BUILD_TYPE=$(BUILD_TYPE) libbinding.a
 go-llama-stable/libbinding.a: go-llama-stable
-	$(MAKE) -C go-llama-stable BUILD_TYPE=$(BUILD_TYPE) libbinding.a
+	$(MAKE) -C go-llama-stable BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
-go-piper/libpiper_binding.a:
+go-piper/libpiper_binding.a: go-piper
 	$(MAKE) -C go-piper libpiper_binding.a example/main
-get-sources: go-llama go-llama-stable go-ggllm go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert bloomz go-stable-diffusion
+get-sources: go-llama go-llama-stable go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert go-stable-diffusion
 	touch $@
 replace:
@@ -222,10 +225,8 @@ replace:
 	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/go-rwkv
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(shell pwd)/whisper.cpp
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(shell pwd)/go-bert
 	$(GOCMD) mod edit -replace github.com/go-skynet/bloomz.cpp=$(shell pwd)/bloomz
 	$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(shell pwd)/go-stable-diffusion
 	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(shell pwd)/go-piper
 	$(GOCMD) mod edit -replace github.com/mudler/go-ggllm.cpp=$(shell pwd)/go-ggllm
 prepare-sources: get-sources replace
 	$(GOCMD) mod download
@@ -241,9 +242,7 @@ rebuild: ## Rebuilds the project
 	$(MAKE) -C whisper.cpp clean
 	$(MAKE) -C go-stable-diffusion clean
 	$(MAKE) -C go-bert clean
 	$(MAKE) -C bloomz clean
 	$(MAKE) -C go-piper clean
 	$(MAKE) -C go-ggllm clean
 	$(MAKE) build
 prepare: prepare-sources $(OPTIONAL_TARGETS)
@@ -261,12 +260,14 @@ clean: ## Remove build related file
 	rm -rf ./backend-assets
 	rm -rf ./go-rwkv
 	rm -rf ./go-bert
 	rm -rf ./bloomz
 	rm -rf ./whisper.cpp
 	rm -rf ./go-piper
 	rm -rf ./go-ggllm
 	rm -rf $(BINARY_NAME)
 	rm -rf release/
 	rm -rf ./backend/cpp/grpc/grpc_repo
 	rm -rf ./backend/cpp/grpc/build
 	rm -rf ./backend/cpp/grpc/installed_packages
 	$(MAKE) -C backend/cpp/llama clean
 ## Build:
@@ -289,12 +290,12 @@ run: prepare ## run local-ai
 test-models/testmodel:
 	mkdir test-models
 	mkdir test-dir
-	wget https://huggingface.co/nnakasato/ggml-model-test/resolve/main/ggml-model-q4.bin -O test-models/testmodel
+	wget -q https://huggingface.co/nnakasato/ggml-model-test/resolve/main/ggml-model-q4.bin -O test-models/testmodel
-	wget https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
+	wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
-	wget https://huggingface.co/skeskinen/ggml/resolve/main/all-MiniLM-L6-v2/ggml-model-q4_0.bin -O test-models/bert
+	wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
-	wget https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
+	wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
-	wget https://huggingface.co/mudler/rwkv-4-raven-1.5B-ggml/resolve/main/RWKV-4-Raven-1B5-v11-Eng99%2525-Other1%2525-20230425-ctx4096_Q4_0.bin -O test-models/rwkv
+	wget -q https://huggingface.co/mudler/rwkv-4-raven-1.5B-ggml/resolve/main/RWKV-4-Raven-1B5-v11-Eng99%2525-Other1%2525-20230425-ctx4096_Q4_0.bin -O test-models/rwkv
-	wget https://raw.githubusercontent.com/saharNooby/rwkv.cpp/5eb8f09c146ea8124633ab041d9ea0b1f1db4459/rwkv/20B_tokenizer.json -O test-models/rwkv.tokenizer.json
+	wget -q https://raw.githubusercontent.com/saharNooby/rwkv.cpp/5eb8f09c146ea8124633ab041d9ea0b1f1db4459/rwkv/20B_tokenizer.json -O test-models/rwkv.tokenizer.json
 	cp tests/models_fixtures/* test-models
 prepare-test: grpcs
@@ -305,14 +306,34 @@ test: prepare test-models/testmodel grpcs
 	@echo 'Running tests'
 	export GO_TAGS="tts stablediffusion"
 	$(MAKE) prepare-test
-	HUGGINGFACE_GRPC=$(abspath ./)/extra/grpc/huggingface/huggingface.py TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
+	HUGGINGFACE_GRPC=$(abspath ./)/extra/grpc/huggingface/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf" --flake-attempts 5 -v -r ./api ./pkg
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf"  --flake-attempts 5 --fail-fast -v -r ./api ./pkg
 	$(MAKE) test-gpt4all
 	$(MAKE) test-llama
 	$(MAKE) test-llama-gguf
 	$(MAKE) test-tts
 	$(MAKE) test-stablediffusion
 # Prepares the e2e test environment: creates $(TEST_DIR), copies the gpu.yaml
 # fixture into it, downloads the test model only if it is not already present
 # (guarded by `test -e`), and builds the `localai-tests` docker image with the
 # currently selected GRPC_BACKENDS and BUILD_TYPE.
 prepare-e2e:
 	mkdir -p $(TEST_DIR)
 	cp -rfv $(abspath ./tests/e2e-fixtures)/gpu.yaml $(TEST_DIR)/gpu.yaml
 	test -e $(TEST_DIR)/ggllm-test-model.bin || wget -q https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct.Q2_K.gguf -O $(TEST_DIR)/ggllm-test-model.bin
 	docker build --build-arg BUILD_GRPC=true --build-arg GRPC_BACKENDS="$(GRPC_BACKENDS)" --build-arg IMAGE_TYPE=core --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg CUDA_MAJOR_VERSION=11 --build-arg CUDA_MINOR_VERSION=7 --build-arg FFMPEG=true -t localai-tests .
 # Starts the `localai-tests` image detached, publishing the API on host port
 # 5390 and mounting $(TEST_DIR) as /models.
 # The container name suffix uses `$$$$`, which make passes to the shell as `$$`
 # (the shell's PID). The previous `$(RANDOM)` was expanded by make itself as an
 # (unset) make variable, so the suffix was always empty and the name was not
 # unique.
 run-e2e-image:
 	ls -liah $(abspath ./tests/e2e-fixtures)
 	docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --gpus all --name e2e-tests-$$$$ localai-tests
 # Runs the ginkgo suite under ./tests/e2e. BUILD_TYPE and LOCALAI_API are
 # passed as environment variables of the ginkgo command (the three lines are
 # joined into a single shell command by the trailing backslashes).
 # LOCALAI_API targets port 5390, the host port published by run-e2e-image.
 test-e2e:
 	@echo 'Running e2e tests'
 	BUILD_TYPE=$(BUILD_TYPE) \
 	LOCALAI_API=http://$(E2E_BRIDGE_IP):5390/v1 \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e
 # Removes $(TEST_DIR) and stops every running container created from the
 # `localai-tests` image.
 # NOTE(review): when no such container is running, `docker ps -q` prints
 # nothing and `docker stop` is invoked without arguments — confirm whether the
 # teardown is expected to fail in that case.
 teardown-e2e:
 	rm -rf $(TEST_DIR) || true
 	docker stop $$(docker ps -q --filter ancestor=localai-tests)
 test-gpt4all: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="gpt4all" --flake-attempts 5 -v -r ./api ./pkg
@@ -365,14 +386,20 @@ protogen-python:
 	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/vllm/ --grpc_python_out=extra/grpc/vllm/ pkg/grpc/proto/backend.proto
 ## GRPC
 # Runs the default make target of every extra/grpc/<backend> directory listed
 # below (presumably each one sets up that backend's conda environment — verify
 # against the per-backend Makefiles).
 # Note: it is duplicated in the Dockerfile
 prepare-extra-conda-environments:
 	$(MAKE) -C extra/grpc/autogptq
 	$(MAKE) -C extra/grpc/bark
 	$(MAKE) -C extra/grpc/diffusers
 	$(MAKE) -C extra/grpc/vllm
 	$(MAKE) -C extra/grpc/huggingface
 	$(MAKE) -C extra/grpc/vall-e-x
 	$(MAKE) -C extra/grpc/exllama
 # Creates the output directory that the gRPC backend targets list as a
 # prerequisite and write their binaries into.
 backend-assets/grpc:
 	mkdir -p backend-assets/grpc
 backend-assets/grpc/falcon: backend-assets/grpc go-ggllm/libggllm.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggllm LIBRARY_PATH=$(shell pwd)/go-ggllm \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/falcon ./cmd/grpc/falcon/
 backend-assets/grpc/llama: backend-assets/grpc go-llama/libbinding.a
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama LIBRARY_PATH=$(shell pwd)/go-llama \
@@ -382,6 +409,37 @@ ifeq ($(BUILD_TYPE),metal)
 	cp go-llama/build/bin/ggml-metal.metal backend-assets/grpc/
 endif
 ## BACKEND CPP LLAMA START
 # Sets the variables in case it has to build the gRPC locally.
 INSTALLED_PACKAGES=$(CURDIR)/backend/cpp/grpc/installed_packages
 INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake
 ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
                  -DProtobuf_DIR=${INSTALLED_LIB_CMAKE}/protobuf \
                  -Dutf8_range_DIR=${INSTALLED_LIB_CMAKE}/utf8_range \
                  -DgRPC_DIR=${INSTALLED_LIB_CMAKE}/grpc \
                  -DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=${INSTALLED_PACKAGES}/include
 # Builds the C++ llama gRPC server by delegating to backend/cpp/llama, pinned
 # to CPPLLAMA_VERSION.
 # When BUILD_GRPC_FOR_BACKEND_LLAMA is defined, gRPC is first built into
 # $(INSTALLED_PACKAGES) and the sub-make is pointed at that installation
 # through the exported variables, PATH and CMAKE_ARGS; otherwise the sub-make
 # runs with the environment as-is.
 # PATH is quoted so that a PATH containing whitespace does not break `export`.
 # NOTE(review): _PROTOBUF_PROTOC points at bin/proto, while the protobuf
 # compiler binary is usually named `protoc` — confirm against the CMake files
 # in backend/cpp/llama.
 backend/cpp/llama/grpc-server:
 ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
 	backend/cpp/grpc/script/build_grpc.sh ${INSTALLED_PACKAGES}
 	export _PROTOBUF_PROTOC=${INSTALLED_PACKAGES}/bin/proto && \
 	export _GRPC_CPP_PLUGIN_EXECUTABLE=${INSTALLED_PACKAGES}/bin/grpc_cpp_plugin && \
 	export PATH="${PATH}:${INSTALLED_PACKAGES}/bin" && \
 	CMAKE_ARGS="${ADDED_CMAKE_ARGS}" LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
 else
 	echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
 	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
 endif
 ## BACKEND CPP LLAMA END
 ##
 # Installs the C++ llama gRPC server binary as the `llama-cpp` backend by
 # copying it into backend-assets/grpc. For metal builds the ggml-metal.metal
 # file from the llama.cpp build tree is copied next to it.
 backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/grpc-server
 	cp -rfv backend/cpp/llama/grpc-server backend-assets/grpc/llama-cpp
 # TODO: every binary should have its own folder instead, so that each can have a different metal implementation
 ifeq ($(BUILD_TYPE),metal)
 	cp backend/cpp/llama/llama.cpp/build/bin/ggml-metal.metal backend-assets/grpc/
 endif
 backend-assets/grpc/llama-stable: backend-assets/grpc go-llama-stable/libbinding.a
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama-stable
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama-stable LIBRARY_PATH=$(shell pwd)/go-llama \
@@ -427,10 +485,6 @@ backend-assets/grpc/rwkv: backend-assets/grpc go-rwkv/librwkv.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-rwkv LIBRARY_PATH=$(shell pwd)/go-rwkv \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./cmd/grpc/rwkv/
 backend-assets/grpc/bloomz: backend-assets/grpc bloomz/libbloomz.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/bloomz LIBRARY_PATH=$(shell pwd)/bloomz \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bloomz ./cmd/grpc/bloomz/
 backend-assets/grpc/bert-embeddings: backend-assets/grpc go-bert/libgobert.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-bert LIBRARY_PATH=$(shell pwd)/go-bert \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./cmd/grpc/bert-embeddings/
@@ -438,12 +492,15 @@ backend-assets/grpc/bert-embeddings: backend-assets/grpc go-bert/libgobert.a
 backend-assets/grpc/langchain-huggingface: backend-assets/grpc
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/langchain-huggingface ./cmd/grpc/langchain-huggingface/
-backend-assets/grpc/stablediffusion: backend-assets/grpc go-stable-diffusion/libstablediffusion.a
+backend-assets/grpc/stablediffusion: backend-assets/grpc
 	if [ ! -f backend-assets/grpc/stablediffusion ]; then \
 		$(MAKE) go-stable-diffusion/libstablediffusion.a; \
 		CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-stable-diffusion/ LIBRARY_PATH=$(shell pwd)/go-stable-diffusion/ \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./cmd/grpc/stablediffusion/
+		$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./cmd/grpc/stablediffusion/; \
 	fi
 backend-assets/grpc/piper: backend-assets/grpc backend-assets/espeak-ng-data go-piper/libpiper_binding.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(shell pwd)/go-piper \
+	CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(shell pwd)/go-piper \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./cmd/grpc/piper/
 backend-assets/grpc/whisper: backend-assets/grpc whisper.cpp/libwhisper.a
--- a/api/api.go
+++ b/api/api.go
@@ -11,6 +11,7 @@ import (
 	"github.com/go-skynet/LocalAI/api/options"
 	"github.com/go-skynet/LocalAI/api/schema"
 	"github.com/go-skynet/LocalAI/internal"
 	"github.com/go-skynet/LocalAI/metrics"
 	"github.com/go-skynet/LocalAI/pkg/assets"
 	"github.com/gofiber/fiber/v2"
@@ -120,6 +121,9 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
 	// Default middleware config
 	app.Use(recover.New())
 	if options.Metrics != nil {
 		app.Use(metrics.APIMiddleware(options.Metrics))
 	}
 	// Auth middleware checking if API key is valid. If no API key is set, no auth is required.
 	auth := func(c *fiber.Ctx) error {
@@ -229,5 +233,7 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
 	app.Get("/v1/models", auth, openai.ListModelsEndpoint(options.Loader, cl))
 	app.Get("/models", auth, openai.ListModelsEndpoint(options.Loader, cl))
 	app.Get("/metrics", metrics.MetricsHandler())
 	return app, nil
 }
--- a/api/api_test.go
+++ b/api/api_test.go
@@ -15,6 +15,7 @@ import (
 	. "github.com/go-skynet/LocalAI/api"
 	"github.com/go-skynet/LocalAI/api/options"
 	"github.com/go-skynet/LocalAI/metrics"
 	"github.com/go-skynet/LocalAI/pkg/gallery"
 	"github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/go-skynet/LocalAI/pkg/utils"
@@ -162,8 +163,12 @@ var _ = Describe("API test", func() {
 				},
 			}
 			metricsService, err := metrics.SetupMetrics()
 			Expect(err).ToNot(HaveOccurred())
 			app, err = App(
 				append(commonOpts,
 					options.WithMetrics(metricsService),
 					options.WithContext(c),
 					options.WithGalleries(galleries),
 					options.WithModelLoader(modelLoader), options.WithBackendAssets(backendAssets), options.WithBackendAssetsOutput(tmpdir))...)
@@ -452,7 +457,7 @@ var _ = Describe("API test", func() {
 				Eventually(func() bool {
 					response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
 					return response["processed"].(bool)
-				}, "360s", "10s").Should(Equal(true))
+				}, "960s", "10s").Should(Equal(true))
 				resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "gpt4all-j", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: "How are you?"}}})
 				Expect(err).ToNot(HaveOccurred())
@@ -479,9 +484,13 @@ var _ = Describe("API test", func() {
 				},
 			}
 			metricsService, err := metrics.SetupMetrics()
 			Expect(err).ToNot(HaveOccurred())
 			app, err = App(
 				append(commonOpts,
 					options.WithContext(c),
 					options.WithMetrics(metricsService),
 					options.WithAudioDir(tmpdir),
 					options.WithImageDir(tmpdir),
 					options.WithGalleries(galleries),
@@ -583,12 +592,15 @@ var _ = Describe("API test", func() {
 			modelLoader = model.NewModelLoader(os.Getenv("MODELS_PATH"))
 			c, cancel = context.WithCancel(context.Background())
-			var err error
+			metricsService, err := metrics.SetupMetrics()
 			Expect(err).ToNot(HaveOccurred())
 			app, err = App(
 				append(commonOpts,
 					options.WithExternalBackend("huggingface", os.Getenv("HUGGINGFACE_GRPC")),
 					options.WithContext(c),
 					options.WithModelLoader(modelLoader),
 					options.WithMetrics(metricsService),
 				)...)
 			Expect(err).ToNot(HaveOccurred())
 			go app.Listen("127.0.0.1:9090")
@@ -675,7 +687,7 @@ var _ = Describe("API test", func() {
 					Input: []string{"sun", "cat"},
 				},
 			)
-			Expect(err).ToNot(HaveOccurred())
+			Expect(err).ToNot(HaveOccurred(), err)
 			Expect(len(resp.Data[0].Embedding)).To(BeNumerically("==", 384))
 			Expect(len(resp.Data[1].Embedding)).To(BeNumerically("==", 384))
@@ -792,10 +804,13 @@ var _ = Describe("API test", func() {
 			modelLoader = model.NewModelLoader(os.Getenv("MODELS_PATH"))
 			c, cancel = context.WithCancel(context.Background())
-			var err error
+			metricsService, err := metrics.SetupMetrics()
 			Expect(err).ToNot(HaveOccurred())
 			app, err = App(
 				append(commonOpts,
 					options.WithContext(c),
 					options.WithMetrics(metricsService),
 					options.WithModelLoader(modelLoader),
 					options.WithConfigFile(os.Getenv("CONFIG_FILE")))...,
 			)
--- a/api/backend/image.go
+++ b/api/backend/image.go
@@ -21,6 +21,7 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
 			PipelineType:  c.Diffusers.PipelineType,
 			CFGScale:      c.Diffusers.CFGScale,
 			LoraAdapter:   c.LoraAdapter,
 			LoraScale:     c.LoraScale,
 			LoraBase:      c.LoraBase,
 			IMG2IMG:       c.Diffusers.IMG2IMG,
 			CLIPModel:     c.Diffusers.ClipModel,
--- a/api/backend/llm.go
+++ b/api/backend/llm.go
@@ -26,7 +26,7 @@ type TokenUsage struct {
 	Completion int
 }
-func ModelInference(ctx context.Context, s string, loader *model.ModelLoader, c config.Config, o *options.Option, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
+func ModelInference(ctx context.Context, s string, images []string, loader *model.ModelLoader, c config.Config, o *options.Option, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
 	modelFile := c.Model
 	grpcOpts := gRPCModelOpts(c)
@@ -72,6 +72,7 @@ func ModelInference(ctx context.Context, s string, loader *model.ModelLoader, c
 	fn := func() (LLMResponse, error) {
 		opts := gRPCPredictOpts(c, loader.ModelPath)
 		opts.Prompt = s
 		opts.Images = images
 		tokenUsage := TokenUsage{}
--- a/api/backend/options.go
+++ b/api/backend/options.go
@@ -45,8 +45,14 @@ func gRPCModelOpts(c config.Config) *pb.ModelOptions {
 		DraftModel:     c.DraftModel,
 		AudioPath:      c.VallE.AudioPath,
 		Quantization:   c.Quantization,
 		MMProj:         c.MMProj,
 		YarnExtFactor:  c.YarnExtFactor,
 		YarnAttnFactor: c.YarnAttnFactor,
 		YarnBetaFast:   c.YarnBetaFast,
 		YarnBetaSlow:   c.YarnBetaSlow,
 		LoraAdapter:    c.LoraAdapter,
 		LoraBase:       c.LoraBase,
 		LoraScale:      c.LoraScale,
 		NGQA:           c.NGQA,
 		RMSNormEps:     c.RMSNormEps,
 		F16Memory:      c.F16,
--- a/api/config/config.go
+++ b/api/config/config.go
@@ -100,10 +100,18 @@ type LLMConfig struct {
 	NUMA            bool     `yaml:"numa"`
 	LoraAdapter     string   `yaml:"lora_adapter"`
 	LoraBase        string   `yaml:"lora_base"`
 	LoraScale       float32  `yaml:"lora_scale"`
 	NoMulMatQ       bool     `yaml:"no_mulmatq"`
 	DraftModel      string   `yaml:"draft_model"`
 	NDraft          int32    `yaml:"n_draft"`
 	Quantization    string   `yaml:"quantization"`
 	MMProj          string   `yaml:"mmproj"`
 	RopeScaling    string  `yaml:"rope_scaling"`
 	YarnExtFactor  float32 `yaml:"yarn_ext_factor"`
 	YarnAttnFactor float32 `yaml:"yarn_attn_factor"`
 	YarnBetaFast   float32 `yaml:"yarn_beta_fast"`
 	YarnBetaSlow   float32 `yaml:"yarn_beta_slow"`
 }
 type AutoGPTQ struct {
--- a/api/openai/chat.go
+++ b/api/openai/chat.go
@@ -6,6 +6,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"strings"
 	"time"
 	"github.com/go-skynet/LocalAI/api/backend"
 	config "github.com/go-skynet/LocalAI/api/config"
@@ -15,15 +16,20 @@ import (
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/go-skynet/LocalAI/pkg/utils"
 	"github.com/gofiber/fiber/v2"
 	"github.com/google/uuid"
 	"github.com/rs/zerolog/log"
 	"github.com/valyala/fasthttp"
 )
 func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
 	emptyMessage := ""
 	id := uuid.New().String()
 	created := int(time.Now().Unix())
 	process := func(s string, req *schema.OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
 		initialMessage := schema.OpenAIResponse{
 			ID:      id,
 			Created: created,
 			Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
 			Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant", Content: &emptyMessage}}},
 			Object:  "chat.completion.chunk",
@@ -32,6 +38,8 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 		ComputeChoices(req, s, config, o, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
 			resp := schema.OpenAIResponse{
 				ID:      id,
 				Created: created,
 				Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
 				Choices: []schema.Choice{{Delta: &schema.Message{Content: &s}, Index: 0}},
 				Object:  "chat.completion.chunk",
@@ -73,6 +81,10 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 			noActionDescription = config.FunctionsConfig.NoActionDescriptionName
 		}
 		if input.ResponseFormat == "json_object" {
 			input.Grammar = grammar.JSONBNF
 		}
 		// process functions if we have any defined or if we have a function call string
 		if len(input.Functions) > 0 && config.ShouldUseFunctions() {
 			log.Debug().Msgf("Response needs to process functions")
@@ -132,14 +144,14 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 				}
 			}
 			r := config.Roles[role]
-			contentExists := i.Content != nil && *i.Content != ""
+			contentExists := i.Content != nil && i.StringContent != ""
 			// First attempt to populate content via a chat message specific template
 			if config.TemplateConfig.ChatMessage != "" {
 				chatMessageData := model.ChatMessageTemplateData{
 					SystemPrompt: config.SystemPrompt,
 					Role:         r,
 					RoleName:     role,
-					Content:      *i.Content,
+					Content:      i.StringContent,
 					MessageIndex: messageIndex,
 				}
 				templatedChatMessage, err := o.Loader.EvaluateTemplateForChatMessage(config.TemplateConfig.ChatMessage, chatMessageData)
@@ -158,7 +170,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 			if content == "" {
 				if r != "" {
 					if contentExists {
-						content = fmt.Sprint(r, " ", *i.Content)
+						content = fmt.Sprint(r, i.StringContent)
 					}
 					if i.FunctionCall != nil {
 						j, err := json.Marshal(i.FunctionCall)
@@ -172,7 +184,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 					}
 				} else {
 					if contentExists {
-						content = fmt.Sprint(*i.Content)
+						content = fmt.Sprint(i.StringContent)
 					}
 					if i.FunctionCall != nil {
 						j, err := json.Marshal(i.FunctionCall)
@@ -261,6 +273,8 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 				}
 				resp := &schema.OpenAIResponse{
 					ID:      id,
 					Created: created,
 					Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
 					Choices: []schema.Choice{
 						{
@@ -324,7 +338,11 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 					// Otherwise ask the LLM to understand the JSON output and the context, and return a message
 					// Note: This costs (in term of CPU) another computation
 					config.Grammar = ""
-					predFunc, err := backend.ModelInference(input.Context, predInput, o.Loader, *config, o, nil)
+					images := []string{}
 					for _, m := range input.Messages {
 						images = append(images, m.StringImages...)
 					}
 					predFunc, err := backend.ModelInference(input.Context, predInput, images, o.Loader, *config, o, nil)
 					if err != nil {
 						log.Error().Msgf("inference error: %s", err.Error())
 						return
@@ -355,6 +373,8 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 		}
 		resp := &schema.OpenAIResponse{
 			ID:      id,
 			Created: created,
 			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
 			Choices: result,
 			Object:  "chat.completion",
--- a/api/openai/completion.go
+++ b/api/openai/completion.go
@@ -6,22 +6,30 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
 	"time"
 	"github.com/go-skynet/LocalAI/api/backend"
 	config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/api/options"
 	"github.com/go-skynet/LocalAI/api/schema"
 	"github.com/go-skynet/LocalAI/pkg/grammar"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/gofiber/fiber/v2"
 	"github.com/google/uuid"
 	"github.com/rs/zerolog/log"
 	"github.com/valyala/fasthttp"
 )
 // https://platform.openai.com/docs/api-reference/completions
 func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
 	id := uuid.New().String()
 	created := int(time.Now().Unix())
 	process := func(s string, req *schema.OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
 		ComputeChoices(req, s, config, o, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
 			resp := schema.OpenAIResponse{
 				ID:      id,
 				Created: created,
 				Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
 				Choices: []schema.Choice{
 					{
@@ -57,6 +65,10 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}
 		if input.ResponseFormat == "json_object" {
 			input.Grammar = grammar.JSONBNF
 		}
 		log.Debug().Msgf("Parameter Config: %+v", config)
 		if input.Stream {
@@ -108,6 +120,8 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 				}
 				resp := &schema.OpenAIResponse{
 					ID:      id,
 					Created: created,
 					Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
 					Choices: []schema.Choice{
 						{
@@ -156,6 +170,8 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 		}
 		resp := &schema.OpenAIResponse{
 			ID:      id,
 			Created: created,
 			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
 			Choices: result,
 			Object:  "text_completion",
--- a/api/openai/edit.go
+++ b/api/openai/edit.go
@@ -3,6 +3,7 @@ package openai
 import (
 	"encoding/json"
 	"fmt"
 	"time"
 	"github.com/go-skynet/LocalAI/api/backend"
 	config "github.com/go-skynet/LocalAI/api/config"
@@ -10,6 +11,7 @@ import (
 	"github.com/go-skynet/LocalAI/api/schema"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/gofiber/fiber/v2"
 	"github.com/google/uuid"
 	"github.com/rs/zerolog/log"
 )
@@ -62,7 +64,11 @@ func EditEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 			result = append(result, r...)
 		}
 		id := uuid.New().String()
 		created := int(time.Now().Unix())
 		resp := &schema.OpenAIResponse{
 			ID:      id,
 			Created: created,
 			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
 			Choices: result,
 			Object:  "edit",
--- a/api/openai/embeddings.go
+++ b/api/openai/embeddings.go
@@ -3,10 +3,12 @@ package openai
 import (
 	"encoding/json"
 	"fmt"
 	"time"
 	"github.com/go-skynet/LocalAI/api/backend"
 	config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/api/schema"
 	"github.com/google/uuid"
 	"github.com/go-skynet/LocalAI/api/options"
 	"github.com/gofiber/fiber/v2"
@@ -57,7 +59,11 @@ func EmbeddingsEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 			items = append(items, schema.Item{Embedding: embeddings, Index: i, Object: "embedding"})
 		}
 		id := uuid.New().String()
 		created := int(time.Now().Unix())
 		resp := &schema.OpenAIResponse{
 			ID:      id,
 			Created: created,
 			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
 			Data:    items,
 			Object:  "list",
--- a/api/openai/image.go
+++ b/api/openai/image.go
@@ -5,11 +5,14 @@ import (
 	"encoding/base64"
 	"encoding/json"
 	"fmt"
 	"github.com/go-skynet/LocalAI/api/schema"
 	"os"
 	"path/filepath"
 	"strconv"
 	"strings"
 	"time"
 	"github.com/go-skynet/LocalAI/api/schema"
 	"github.com/google/uuid"
 	"github.com/go-skynet/LocalAI/api/backend"
 	config "github.com/go-skynet/LocalAI/api/config"
@@ -174,7 +177,11 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
 			}
 		}
 		id := uuid.New().String()
 		created := int(time.Now().Unix())
 		resp := &schema.OpenAIResponse{
 			ID:      id,
 			Created: created,
 			Data:    result,
 		}
--- a/api/openai/inference.go
+++ b/api/openai/inference.go
@@ -23,8 +23,13 @@ func ComputeChoices(
 		n = 1
 	}
 	images := []string{}
 	for _, m := range req.Messages {
 		images = append(images, m.StringImages...)
 	}
 	// get the model function to call for the result
-	predFunc, err := backend.ModelInference(req.Context, predInput, loader, *config, o, tokenCallback)
+	predFunc, err := backend.ModelInference(req.Context, predInput, images, loader, *config, o, tokenCallback)
 	if err != nil {
 		return result, backend.TokenUsage{}, err
 	}
--- a/api/openai/request.go
+++ b/api/openai/request.go
@@ -2,8 +2,11 @@ package openai
 import (
 	"context"
 	"encoding/base64"
 	"encoding/json"
 	"fmt"
 	"io/ioutil"
 	"net/http"
 	"os"
 	"path/filepath"
 	"strings"
@@ -24,7 +27,7 @@ func readInput(c *fiber.Ctx, o *options.Option, randomModel bool) (string, *sche
 	input.Cancel = cancel
 	// Get input data from the request body
 	if err := c.BodyParser(input); err != nil {
-		return "", nil, err
+		return "", nil, fmt.Errorf("failed parsing request body: %w", err)
 	}
 	modelFile := input.Model
@@ -61,6 +64,37 @@ func readInput(c *fiber.Ctx, o *options.Option, randomModel bool) (string, *sche
 	return modelFile, input, nil
 }
 // getBase64Image returns the base64-encoded image payload referenced by s.
 //
 // Two input forms are accepted:
 //   - an http:// or https:// URL: the image is downloaded into memory and
 //     encoded with standard base64. A non-2xx response is reported as an error
 //     instead of encoding the error page as if it were an image.
 //   - a "data:image/<type>;base64,<payload>" URI (any image type, not only
 //     jpeg): the header is dropped and the payload is returned unchanged.
 //
 // Any other input yields an error.
 //
 // NOTE(review): s comes from the request body; the download is performed with
 // the default HTTP client (no timeout, no size limit) — consider restricting it.
 func getBase64Image(s string) (string, error) {
 	if strings.HasPrefix(s, "http://") || strings.HasPrefix(s, "https://") {
 		// download the image
 		resp, err := http.Get(s)
 		if err != nil {
 			return "", err
 		}
 		defer resp.Body.Close()

 		// refuse to treat an error response body as image data
 		if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
 			return "", fmt.Errorf("failed downloading image: unexpected status %q", resp.Status)
 		}

 		// read the image data into memory
 		data, err := ioutil.ReadAll(resp.Body)
 		if err != nil {
 			return "", err
 		}

 		// encode the image data in base64 and return it
 		return base64.StdEncoding.EncodeToString(data), nil
 	}

 	// data URI: drop everything up to and including the ";base64," marker
 	const dataImagePrefix = "data:image/"
 	const base64Marker = ";base64,"
 	if strings.HasPrefix(s, dataImagePrefix) {
 		if idx := strings.Index(s, base64Marker); idx != -1 {
 			return s[idx+len(base64Marker):], nil
 		}
 	}

 	return "", fmt.Errorf("not valid string")
 }
 func updateConfig(config *config.Config, input *schema.OpenAIRequest) {
 	if input.Echo {
 		config.Echo = input.Echo
@@ -129,6 +163,35 @@ func updateConfig(config *config.Config, input *schema.OpenAIRequest) {
 		}
 	}
 	// Decode each request's message content.
 	// Content is either a plain string or a list of typed parts ("text" /
 	// "image_url"); a list is re-marshalled into []schema.Content.
 	// index numbers the "[img-N]" placeholders and is shared by all messages
 	// of the request.
 	index := 0
 	for i, m := range input.Messages {
 		switch content := m.Content.(type) {
 		case string:
 			input.Messages[i].StringContent = content
 		case []interface{}:
 			dat, err := json.Marshal(content)
 			if err != nil {
 				fmt.Printf("Failed encoding message content: %v\n", err)
 				continue
 			}
 			parts := []schema.Content{}
 			if err := json.Unmarshal(dat, &parts); err != nil {
 				fmt.Printf("Failed decoding message content: %v\n", err)
 				continue
 			}
 			// Text and placeholders are tracked separately so that a text part
 			// listed after an image does not wipe the image placeholder.
 			text := input.Messages[i].StringContent
 			placeholders := ""
 			for _, pp := range parts {
 				switch pp.Type {
 				case "text":
 					text = pp.Text
 				case "image_url":
 					// Detect if pp.ImageURL is an URL, if it is download the image and encode it in base64:
 					encoded, err := getBase64Image(pp.ImageURL.URL)
 					if err != nil {
 						fmt.Printf("Failed encoding image: %v\n", err)
 						continue
 					}
 					input.Messages[i].StringImages = append(input.Messages[i].StringImages, encoded) // TODO: make sure that we only return base64 stuff
 					// set a placeholder for each image (newest first, as before)
 					placeholders = fmt.Sprintf("[img-%d]", index) + placeholders
 					index++
 				}
 			}
 			input.Messages[i].StringContent = placeholders + text
 		}
 	}
 	if input.RepeatPenalty != 0 {
 		config.RepeatPenalty = input.RepeatPenalty
 	}
--- a/api/options/options.go
+++ b/api/options/options.go
@@ -7,6 +7,7 @@ import (
 	"github.com/go-skynet/LocalAI/pkg/gallery"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/go-skynet/LocalAI/metrics"
 	"github.com/rs/zerolog/log"
 )
@@ -24,6 +25,7 @@ type Option struct {
 	PreloadModelsFromPath               string
 	CORSAllowOrigins                    string
 	ApiKeys                             []string
 	Metrics                             *metrics.Metrics
 	Galleries []gallery.Gallery
@@ -198,3 +200,9 @@ func WithApiKeys(apiKeys []string) AppOption {
 		o.ApiKeys = apiKeys
 	}
 }
 // WithMetrics returns an AppOption that stores meter in the Metrics field of
 // the Option it is applied to.
 func WithMetrics(meter *metrics.Metrics) AppOption {
 	apply := func(opts *Option) {
 		opts.Metrics = meter
 	}
 	return apply
 }
--- a/api/schema/openai.go
+++ b/api/schema/openai.go
@@ -55,11 +55,25 @@ type Choice struct {
 	Text         string   `json:"text,omitempty"`
 }
 // Content is one typed part of a multi-part message content list.
 type Content struct {
 	// Type selects which of the other fields is meaningful; the request
 	// decoder handles "text" and "image_url".
 	Type     string     `json:"type" yaml:"type"`
 	// Text is the textual payload of a "text" part.
 	Text     string     `json:"text" yaml:"text"`
 	// ImageURL is the image reference of an "image_url" part.
 	ImageURL ContentURL `json:"image_url" yaml:"image_url"`
 }
 // ContentURL wraps the URL of an image part.
 type ContentURL struct {
 	// URL is the string handed to the image loader for an "image_url" part.
 	URL string `json:"url" yaml:"url"`
 }
 type Message struct {
 	// The message role
 	Role string `json:"role,omitempty" yaml:"role"`
 	// The message content
-	Content *string `json:"content" yaml:"content"`
+	Content interface{} `json:"content" yaml:"content"`
 	StringContent string   `json:"string_content,omitempty" yaml:"string_content,omitempty"`
 	StringImages  []string `json:"string_images,omitempty" yaml:"string_images,omitempty"`
 	// A result of a function call
 	FunctionCall interface{} `json:"function_call,omitempty" yaml:"function_call,omitempty"`
 }
--- a/backend/cpp/grpc/.gitignore
+++ b/backend/cpp/grpc/.gitignore
@@ -0,0 +1,3 @@
 installed_packages/
 grpc_build/
 grpc_repo/
--- a/backend/cpp/grpc/script/build_grpc.sh
+++ b/backend/cpp/grpc/script/build_grpc.sh
@@ -0,0 +1,81 @@
 #!/bin/bash
 # Builds locally from sources the packages needed by the llama cpp backend.
 #
 # Usage: build_grpc.sh <install-dir>
 #   <install-dir> is where gRPC gets installed. If the directory already
 #   exists the script assumes a previous install and exits successfully.
 #
 # Makes sure a few base packages exist.
 # sudo apt-get --no-upgrade -y install g++ gcc binutils cmake git build-essential autoconf libtool pkg-config
 # Stop at the first failing command so that a broken build is never mistaken
 # for a finished install.
 set -euo pipefail
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
 echo "Script directory: $SCRIPT_DIR"
 CPP_INSTALLED_PACKAGES_DIR="${1:-}"
 if [ -z "${CPP_INSTALLED_PACKAGES_DIR}" ]; then
    echo "CPP_INSTALLED_PACKAGES_DIR (first argument) not set. Don't know where to install: failed."
    echo
    exit 1
 fi
 if [ -d "${CPP_INSTALLED_PACKAGES_DIR}" ]; then
  echo "gRPC installation directory already exists. Nothing to do."
  exit 0
 fi
 # The depth when cloning a git repo. 1 speeds up the clone when the repo history is not needed.
 GIT_CLONE_DEPTH=1
 NUM_BUILD_THREADS=$(nproc --ignore=1)
 # Google gRPC --------------------------------------------------------------------------------------
 TAG_LIB_GRPC="v1.59.0"
 GIT_REPO_LIB_GRPC="https://github.com/grpc/grpc.git"
 GRPC_REPO_DIR="${SCRIPT_DIR}/../grpc_repo"
 GRPC_BUILD_DIR="${SCRIPT_DIR}/../grpc_build"
 SRC_DIR_LIB_GRPC="${GRPC_REPO_DIR}/grpc"
 echo "SRC_DIR_LIB_GRPC: ${SRC_DIR_LIB_GRPC}"
 echo "GRPC_REPO_DIR: ${GRPC_REPO_DIR}"
 echo "GRPC_BUILD_DIR: ${GRPC_BUILD_DIR}"
 # Always drop the temporary clone and build tree on exit. On failure also drop
 # the install directory (created below, it did not exist before), otherwise the
 # next run would see it and report "Nothing to do".
 cleanup() {
  status=$?
  rm -rf "${GRPC_BUILD_DIR}" "${GRPC_REPO_DIR}"
  if [ "${status}" -ne 0 ]; then
    rm -rf "${CPP_INSTALLED_PACKAGES_DIR}"
  fi
 }
 trap cleanup EXIT
 mkdir -pv "${GRPC_REPO_DIR}"
 rm   -rf "${GRPC_BUILD_DIR}"
 mkdir -pv "${GRPC_BUILD_DIR}"
 mkdir -pv "${CPP_INSTALLED_PACKAGES_DIR}"
 if [ -d "${SRC_DIR_LIB_GRPC}" ]; then
  echo "gRPC source already exists locally. Not cloned again."
 else
  ( cd "${GRPC_REPO_DIR}" && \
    git clone --depth "${GIT_CLONE_DEPTH}" -b "${TAG_LIB_GRPC}" "${GIT_REPO_LIB_GRPC}" && \
    cd "${SRC_DIR_LIB_GRPC}" && \
    git submodule update --init --recursive --depth "${GIT_CLONE_DEPTH}"
  )
 fi
 # Configure, build and install gRPC (C++ and Python plugins only) into the
 # requested prefix.
 ( cd "${GRPC_BUILD_DIR}" && \
  cmake -G "Unix Makefiles" \
     -DCMAKE_BUILD_TYPE=Release \
     -DgRPC_INSTALL=ON \
     -DEXECUTABLE_OUTPUT_PATH="${CPP_INSTALLED_PACKAGES_DIR}/grpc/bin" \
     -DLIBRARY_OUTPUT_PATH="${CPP_INSTALLED_PACKAGES_DIR}/grpc/lib" \
     -DgRPC_BUILD_TESTS=OFF \
     -DgRPC_BUILD_CSHARP_EXT=OFF \
     -DgRPC_BUILD_GRPC_CPP_PLUGIN=ON \
     -DgRPC_BUILD_GRPC_CSHARP_PLUGIN=OFF \
     -DgRPC_BUILD_GRPC_NODE_PLUGIN=OFF \
     -DgRPC_BUILD_GRPC_OBJECTIVE_C_PLUGIN=OFF \
     -DgRPC_BUILD_GRPC_PHP_PLUGIN=OFF \
     -DgRPC_BUILD_GRPC_PYTHON_PLUGIN=ON \
     -DgRPC_BUILD_GRPC_RUBY_PLUGIN=OFF \
     -Dprotobuf_WITH_ZLIB=ON \
     -DRE2_BUILD_TESTING=OFF \
     -DCMAKE_INSTALL_PREFIX="${CPP_INSTALLED_PACKAGES_DIR}/" \
     "${SRC_DIR_LIB_GRPC}"  && \
  cmake --build .  -- -j "${NUM_BUILD_THREADS}" && \
  cmake --build .  --target install -- -j "${NUM_BUILD_THREADS}"
 )
--- a/backend/cpp/llama/CMakeLists.txt
+++ b/backend/cpp/llama/CMakeLists.txt
@@ -0,0 +1,74 @@
 ## XXX: In some versions of CMake clip wasn't being built before llama.
 ## This is a hack for now, but it should be fixed in the future.
 ## clip.cpp/clip.h are compiled here into a local "myclip" library that the
 ## grpc-server target links against further down.
 set(TARGET myclip)
 add_library(${TARGET} clip.cpp clip.h)
 install(TARGETS ${TARGET} LIBRARY)
 target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if (NOT MSVC)
    target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
 endif()
 set(TARGET grpc-server)
 # END CLIP hack
 set(CMAKE_CXX_STANDARD 17)
 cmake_minimum_required(VERSION 3.15)
 # NOTE(review): TARGET is already set to grpc-server just above; this repeats it.
 set(TARGET grpc-server)
 set(_PROTOBUF_LIBPROTOBUF libprotobuf)
 set(_REFLECTION grpc++_reflection)
 # On macOS also search the Homebrew prefix for libraries and headers.
 if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
    link_directories("/opt/homebrew/lib")
    include_directories("/opt/homebrew/include")
 endif()
 # gRPC and its dependencies must be discoverable as CMake config packages.
 find_package(absl CONFIG REQUIRED)
 find_package(Protobuf CONFIG REQUIRED)
 find_package(gRPC CONFIG REQUIRED)
 find_program(_PROTOBUF_PROTOC protoc)
 set(_GRPC_GRPCPP grpc++)
 find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin)
 # The generated backend.pb.h / backend.grpc.pb.h land in the binary dir.
 include_directories(${CMAKE_CURRENT_BINARY_DIR})
 include_directories(${Protobuf_INCLUDE_DIRS})
 message(STATUS "Using protobuf version ${Protobuf_VERSION} | Protobuf_INCLUDE_DIRS: ${Protobuf_INCLUDE_DIRS} | CMAKE_CURRENT_BINARY_DIR: ${CMAKE_CURRENT_BINARY_DIR}")
 # Proto file
 # The relative path assumes this file has been copied to
 # llama.cpp/examples/grpc-server inside backend/cpp/llama (see the Makefile).
 get_filename_component(hw_proto "../../../../../../pkg/grpc/proto/backend.proto" ABSOLUTE)
 get_filename_component(hw_proto_path "${hw_proto}" PATH)
 # Generated sources
 set(hw_proto_srcs "${CMAKE_CURRENT_BINARY_DIR}/backend.pb.cc")
 set(hw_proto_hdrs "${CMAKE_CURRENT_BINARY_DIR}/backend.pb.h")
 set(hw_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/backend.grpc.pb.cc")
 set(hw_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/backend.grpc.pb.h")
 # Run protoc with the gRPC C++ plugin to produce the four files above;
 # re-run whenever backend.proto changes.
 add_custom_command(
      OUTPUT "${hw_proto_srcs}" "${hw_proto_hdrs}" "${hw_grpc_srcs}" "${hw_grpc_hdrs}"
      COMMAND ${_PROTOBUF_PROTOC}
      ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}"
        --cpp_out "${CMAKE_CURRENT_BINARY_DIR}"
        -I "${hw_proto_path}"
        --plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}"
        "${hw_proto}"
      DEPENDS "${hw_proto}")
 # hw_grpc_proto
 # Library holding the generated protobuf / gRPC sources.
 add_library(hw_grpc_proto
  ${hw_grpc_srcs}
  ${hw_grpc_hdrs}
  ${hw_proto_srcs}
  ${hw_proto_hdrs} )
 # The gRPC server executable itself.
 add_executable(${TARGET} grpc-server.cpp json.hpp )
 target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
  absl::flags_parse
  gRPC::${_REFLECTION}
  gRPC::${_GRPC_GRPCPP}
  protobuf::${_PROTOBUF_LIBPROTOBUF})
 # NOTE(review): requests cxx_std_11 although CMAKE_CXX_STANDARD is set to 17 above.
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 # Only add the dependency when a BUILD_INFO target exists in the enclosing build.
 if(TARGET BUILD_INFO)
  add_dependencies(${TARGET} BUILD_INFO)
 endif()
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -0,0 +1,50 @@
 # Builds the llama.cpp based gRPC backend ("grpc-server").
 #
 # Overridable variables:
 #   LLAMA_VERSION  llama.cpp commit that gets checked out
 #   CMAKE_ARGS     extra arguments passed to the llama.cpp cmake configure step
 #   BUILD_TYPE     one of cublas, openblas, clblast, hipblas (empty: none)
 LLAMA_VERSION?=d9b33fe95bd257b36c84ee5769cc048230067d6f
 CMAKE_ARGS?=
 BUILD_TYPE?=
 # If build type is cublas, then we set -DLLAMA_CUBLAS=ON to CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)
 	CMAKE_ARGS+=-DLLAMA_CUBLAS=ON
 # If build type is openblas then we set -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
 # to CMAKE_ARGS automatically
 else ifeq ($(BUILD_TYPE),openblas)
 	CMAKE_ARGS+=-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
 # If build type is clblast (openCL) we set -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
 # NOTE(review): /some/path looks like a placeholder; confirm the intended CLBlast location.
 else ifeq ($(BUILD_TYPE),clblast)
 	CMAKE_ARGS+=-DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
 # If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ 
 else ifeq ($(BUILD_TYPE),hipblas)
 	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON
 endif
 # rebuild and clean do not produce files of those names; declaring them phony
 # keeps them working even if such a file or directory exists.
 .PHONY: rebuild clean
 # Clone llama.cpp and pin it to LLAMA_VERSION.
 llama.cpp:
 	git clone --recurse-submodules https://github.com/ggerganov/llama.cpp llama.cpp
 	cd llama.cpp && git checkout -b build $(LLAMA_VERSION) && git submodule update --init --recursive --depth 1
 # Copy the server sources into the llama.cpp examples tree and register the
 # new subdirectory with the examples CMakeLists.
 llama.cpp/examples/grpc-server:
 	mkdir -p llama.cpp/examples/grpc-server
 	cp -r $(abspath ./)/CMakeLists.txt llama.cpp/examples/grpc-server/
 	cp -r $(abspath ./)/grpc-server.cpp llama.cpp/examples/grpc-server/
 	cp -rfv $(abspath ./)/json.hpp llama.cpp/examples/grpc-server/
 	echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
 ## XXX: In some versions of CMake clip wasn't being built before llama.
 ## This is a hack for now, but it should be fixed in the future.
 	cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
 	cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
 # Refresh the copied sources and rebuild the binary.
 rebuild:
 	cp -rfv $(abspath ./)/CMakeLists.txt llama.cpp/examples/grpc-server/
 	cp -rfv $(abspath ./)/grpc-server.cpp llama.cpp/examples/grpc-server/
 	cp -rfv $(abspath ./)/json.hpp llama.cpp/examples/grpc-server/
 	rm -rf grpc-server
 	$(MAKE) grpc-server
 # Remove the llama.cpp checkout and the built binary.
 clean:
 	rm -rf llama.cpp
 	rm -rf grpc-server
 # Configure and build llama.cpp (including the grpc-server example) and copy
 # the resulting binary next to this Makefile.
 grpc-server: llama.cpp llama.cpp/examples/grpc-server
 	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release
 	cp llama.cpp/build/bin/grpc-server .
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
--- a/backend/cpp/llama/json.hpp
+++ b/backend/cpp/llama/json.hpp
--- a/cmd/grpc/bloomz/main.go
+++ b/cmd/grpc/bloomz/main.go
@@ -1,23 +0,0 @@
 package main
 // Note: this is started internally by LocalAI and a server is allocated for each model
 import (
 	"flag"
 	bloomz "github.com/go-skynet/LocalAI/pkg/backend/llm/bloomz"
 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
 var (
 	addr = flag.String("addr", "localhost:50051", "the address to connect to")
 )
 func main() {
 	flag.Parse()
 	if err := grpc.StartServer(*addr, &bloomz.LLM{}); err != nil {
 		panic(err)
 	}
 }
--- a/cmd/grpc/falcon/main.go
+++ b/cmd/grpc/falcon/main.go
@@ -1,25 +0,0 @@
 package main
 // GRPC Falcon server
 // Note: this is started internally by LocalAI and a server is allocated for each model
 import (
 	"flag"
 	falcon "github.com/go-skynet/LocalAI/pkg/backend/llm/falcon"
 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
 var (
 	addr = flag.String("addr", "localhost:50051", "the address to connect to")
 )
 func main() {
 	flag.Parse()
 	if err := grpc.StartServer(*addr, &falcon.LLM{}); err != nil {
 		panic(err)
 	}
 }
--- a/custom-ca-certs/.keep
+++ b/custom-ca-certs/.keep
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -12,4 +12,5 @@ services:
      - .env
    volumes:
      - ./models:/models:cached
      - ./images/:/tmp/generated/images/
    command: ["/usr/bin/local-ai" ]
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -5,7 +5,7 @@ cd /build
 if [ "$REBUILD" != "false" ]; then
 	rm -rf ./local-ai
-	ESPEAK_DATA=/build/lib/Linux-$(uname -m)/piper_phonemize/lib/espeak-ng-data make build -j${BUILD_PARALLELISM:-1}
+	make build -j${BUILD_PARALLELISM:-1}
 else
 	echo "@@@@@"
 	echo "Skipping rebuild"
--- a/examples/autoGPT/.env.example
+++ b/examples/autoGPT/.env.example
@@ -1,5 +1,9 @@
 # CPU .env docs: https://localai.io/howtos/easy-setup-docker-cpu/
 # GPU .env docs: https://localai.io/howtos/easy-setup-docker-gpu/
 OPENAI_API_KEY=sk---anystringhere
 OPENAI_API_BASE=http://api:8080/v1
 # Models to preload at start
-# Here we configure gpt4all as gpt-3.5-turbo and bert as embeddings
+# Here we configure gpt4all as gpt-3.5-turbo and bert as embeddings,
 # see other options in the model gallery at https://github.com/go-skynet/model-gallery
 PRELOAD_MODELS=[{"url": "github:go-skynet/model-gallery/gpt4all-j.yaml", "name": "gpt-3.5-turbo"}, { "url": "github:go-skynet/model-gallery/bert-embeddings.yaml", "name": "text-embedding-ada-002"}]
--- a/examples/autoGPT/README.md
+++ b/examples/autoGPT/README.md
@@ -10,12 +10,16 @@ git clone https://github.com/go-skynet/LocalAI
 cd LocalAI/examples/autoGPT
 cp -rfv .env.example .env
 # Edit the .env file to set a different model by editing `PRELOAD_MODELS`.
 vim .env
 docker-compose run --rm auto-gpt
 ```
 Note: The example automatically downloads the `gpt4all` model as it is under a permissive license. The GPT4All model does not seem to be enough to run AutoGPT. WizardLM-7b-uncensored seems to perform better (with `f16: true`).
 See the `.env` configuration file to set a different model with the [model-gallery](https://github.com/go-skynet/model-gallery) by editing `PRELOAD_MODELS`.
 ## Without docker
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,11 @@
 meta {
  name: backend monitor
  type: http
  seq: 4
 }
 get {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/backend/monitor
  body: none
  auth: none
 }
--- a/monitor/backend-shutdown.bru
+++ b/monitor/backend-shutdown.bru
@@ -0,0 +1,21 @@
 meta {
  name: backend-shutdown
  type: http
  seq: 3
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/backend/shutdown
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "model": "{{DEFAULT_MODEL}}"
  }
 }
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,5 @@
 {
  "version": "1",
  "name": "LocalAI Test Requests",
  "type": "collection"
 }
--- a/Requests/environments/localhost.bru
+++ b/Requests/environments/localhost.bru
@@ -0,0 +1,6 @@
 vars {
  HOST: localhost
  PORT: 8080
  DEFAULT_MODEL: gpt-3.5-turbo
  PROTOCOL: http://
 }
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,11 @@
 meta {
  name: get models list
  type: http
  seq: 2
 }
 get {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models
  body: none
  auth: none
 }
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,25 @@
 meta {
  name: Generate image
  type: http
  seq: 1
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/images/generations
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
    "prompt": "<positive prompt>|<negative prompt>",
    "model": "model-name",
    "step": 51,
    "size": "1024x1024",
    "image": ""
  }
 }
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,24 @@
 meta {
  name: -completions
  type: http
  seq: 4
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/completions
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "model": "{{DEFAULT_MODEL}}",
      "prompt": "function downloadFile(string url, string outputPath) {",
      "max_tokens": 256,
      "temperature": 0.5
  }
 }
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,23 @@
 meta {
  name: -edits
  type: http
  seq: 5
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/edits
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "model": "{{DEFAULT_MODEL}}",
      "input": "What day of the wek is it?",
      "instruction": "Fix the spelling mistakes"
  }
 }
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,22 @@
 meta {
  name: -embeddings
  type: http
  seq: 6
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/embeddings
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "model": "{{DEFAULT_MODEL}}",
      "input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"
  }
 }
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,30 @@
 meta {
  name: chat completion -simple- 1 message-
  type: http
  seq: 4
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
    "model": "{{DEFAULT_MODEL}}",
    "messages": [
      {
        "role": "user",
        "content": "How could one use friction to cook an egg?"
      }
    ],
    "max_tokens": 256,
    "temperature": 0.2,
    "grammar": ""
  }
 }
--- a/text/chat/chat-completions
+++ b/text/chat/chat-completions
@@ -0,0 +1,29 @@
 meta {
  name: chat-completions -long-
  type: http
  seq: 5
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
       "model": "{{DEFAULT_MODEL}}",
       "messages": [{"role": "system", "content": "You are a helpful, intelligent pirate assistant. Always answer as helpfully as possible, while being a pirate and using pirate language. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."},
           {"role": "user", "content": "How could one use electricity to cook an egg?"},
           {"role": "assistant",
                  "content": "Shiver me timbers! Using electricity to cook an egg? Well, matey, I reckon that be a right curious idea! *adjusts spectacles* Now, I ain't sure if this be possible, but I'll do me best to help ye out.\n\nFirst things first, ye gotta understand that electricity be a powerful force, and it can't just cook an egg on its own. Ye see, electricity be like a mighty wind that can make things happen, but it needs somethin' to work with. So, if ye want to use electricity to cook an egg, ye gotta find a way to harness that power and make it do the cookin'.\n\nNow, I know what ye might be thinkin': \"How do I harness the power of electricity to cook an egg?\" Well, matey, there be a few ways to do it. One way be to use a special device called an \"electric frying pan.\" This be a pan that has a built-in heating element that gets hot when ye plug it into a wall socket. When the element gets hot, ye can crack an egg into the pan and watch as it cook"
              },
              {"role": "user", "content": "I don't have one of those, just a raw wire and plenty of power! How do we get it done?"}],
       "max_tokens": 1024,
       "temperature": 0.5
  }
 }
--- a/text/chat/chat-completions
+++ b/text/chat/chat-completions
@@ -0,0 +1,25 @@
 meta {
  name: chat-completions -stream-
  type: http
  seq: 6
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
       "model": "{{DEFAULT_MODEL}}",
       "messages": [{"role": "user", "content": "Explain how I can set sail on the ocean using only power generated by seagulls?"}],
       "max_tokens": 256,
       "temperature": 0.9,
       "stream": true
  }
 }
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,22 @@
 meta {
  name: add model gallery
  type: http
  seq: 10
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "url": "file:///home/dave/projects/model-gallery/huggingface/TheBloke__CodeLlama-7B-Instruct-GGML.yaml",
      "name": "test"
  }
 }
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,21 @@
 meta {
  name: delete model gallery
  type: http
  seq: 11
 }
 delete {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "name": "test"
  }
 }
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,11 @@
 meta {
  name: list MODELS in galleries
  type: http
  seq: 7
 }
 get {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/available
  body: none
  auth: none
 }
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,11 @@
 meta {
  name: list model GALLERIES
  type: http
  seq: 8
 }
 get {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
  body: none
  auth: none
 }
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,21 @@
 meta {
  name: model gallery apply -gist-
  type: http
  seq: 12
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/apply
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "id": "TheBloke__CodeLlama-7B-Instruct-GGML__codellama-7b-instruct.ggmlv3.Q2_K.bin"
  }
 }
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,22 @@
 meta {
  name: model gallery apply
  type: http
  seq: 9
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/apply
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "id": "dave@TheBloke__CodeLlama-7B-Instruct-GGML__codellama-7b-instruct.ggmlv3.Q3_K_S.bin",
      "name": "codellama7b"
  }
 }
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,22 @@
 meta {
  name: -tts
  type: http
  seq: 2
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/tts
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "model": "{{DEFAULT_MODEL}}",
      "input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"
  }
 }
--- a/examples/chainlit/Dockerfile
+++ b/examples/chainlit/Dockerfile
@@ -0,0 +1,16 @@
 # Use the official Python runtime from the public registry as a parent image,
 # so the example builds without access to a private mirror
 FROM python:3.9-slim
 # Set the working directory in the container
 WORKDIR /app
 # Copy only the dependency list first so the install layer below is reused
 # when just the application code changes
 COPY requirements.txt /app
 # Install any needed packages specified in requirements.txt
 # (--no-cache-dir keeps pip's download cache out of the image layer)
 RUN pip install --no-cache-dir -r requirements.txt
 # Copy the rest of the current directory contents into the container at /app
 COPY . /app
 # Run main.py with chainlit when the container launches, listening on all
 # interfaces (NOTE(review): -h is presumably chainlit's headless flag; confirm)
 CMD ["chainlit", "run", "-h", "--host", "0.0.0.0", "main.py" ]
--- a/examples/chainlit/README.md
+++ b/examples/chainlit/README.md
@@ -0,0 +1,25 @@
 # LocalAI Demonstration with Embeddings and Chainlit
 This demonstration shows you how to use embeddings with existing data in `LocalAI`, and how to integrate it with Chainlit for an interactive querying experience. We are using the `llama_index` library to facilitate the embedding and querying processes, and `chainlit` to provide an interactive interface. The `Weaviate` client is used as the embedding source.
 ## Prerequisites
 Before proceeding, make sure you have the following installed:
 - Weaviate client
 - LocalAI and its dependencies
 - Chainlit and its dependencies
 ## Getting Started
 1. Clone this repository: `git clone https://github.com/go-skynet/LocalAI`
 2. Navigate to the project directory: `cd LocalAI/examples/chainlit`
 3. Run the example: `chainlit run main.py`
 # Highlight on `llama_index` and `chainlit`
 `llama_index` is the key library that facilitates the process of embedding and querying data in LocalAI. It provides a seamless interface to integrate various components, such as `WeaviateVectorStore`, `LocalAI`, `ServiceContext`, and more, for a smooth querying experience.
 `chainlit` is used to provide an interactive interface for users to query the data and see the results in real-time. It integrates with llama_index to handle the querying process and display the results to the user.
 In this example, `llama_index` is used to set up the `VectorStoreIndex` and `QueryEngine`, and `chainlit` is used to handle the user interactions with `LocalAI` and display the results.
--- a/examples/chainlit/config.yaml
+++ b/examples/chainlit/config.yaml
@@ -0,0 +1,16 @@
 # Settings read by main.py; every key is optional and falls back to the
 # default coded there.
 localAI:
   temperature: 0
   modelName: gpt-3.5-turbo
   apiBase: http://local-ai.default
   apiKey: stub
   streaming: true
 # The section name is spelled "weviate" because main.py looks it up under
 # exactly that key.
 weviate:
   url: http://weviate.local
   index: AIChroma
 query:
   mode: hybrid
   topK: 1
   alpha: 0.0
   chunkSize: 1024
 embedding:
   model: BAAI/bge-small-en-v1.5
--- a/examples/chainlit/main.py
+++ b/examples/chainlit/main.py
@@ -0,0 +1,82 @@
 import os
 import weaviate
 from llama_index.storage.storage_context import StorageContext
 from llama_index.vector_stores import WeaviateVectorStore
 from llama_index.query_engine.retriever_query_engine import RetrieverQueryEngine
 from llama_index.callbacks.base import CallbackManager
 from llama_index import (
    LLMPredictor,
    ServiceContext,
    StorageContext,
    VectorStoreIndex,
 )
 import chainlit as cl
 from llama_index.llms import LocalAI
 from llama_index.embeddings import HuggingFaceEmbedding
 import yaml
 # Load the configuration file
 with open("config.yaml", "r") as ymlfile:
    # An empty file parses to None; treat it as an empty mapping.
    cfg = yaml.safe_load(ymlfile) or {}
 # A missing or empty section falls back to an empty mapping, so the per-key
 # defaults below apply instead of raising KeyError/AttributeError.
 local_ai_cfg = cfg.get('localAI') or {}
 weaviate_cfg = cfg.get('weviate') or {}  # key spelling matches config.yaml
 query_cfg = cfg.get('query') or {}
 embedding_cfg = cfg.get('embedding') or {}
 # Get the values from the configuration file or set the default values
 temperature = local_ai_cfg.get('temperature', 0)
 model_name = local_ai_cfg.get('modelName', "gpt-3.5-turbo")
 api_base = local_ai_cfg.get('apiBase', "http://local-ai.default")
 api_key = local_ai_cfg.get('apiKey', "stub")
 streaming = local_ai_cfg.get('streaming', True)
 weaviate_url = weaviate_cfg.get('url', "http://weviate.default")
 index_name = weaviate_cfg.get('index', "AIChroma")
 query_mode = query_cfg.get('mode', "hybrid")
 topK = query_cfg.get('topK', 1)
 alpha = query_cfg.get('alpha', 0.0)
 embed_model_name = embedding_cfg.get('model', "BAAI/bge-small-en-v1.5")
 chunk_size = query_cfg.get('chunkSize', 1024)
 # Shared objects created once at import time and reused by every chat session.
 embed_model = HuggingFaceEmbedding(model_name=embed_model_name)
 llm = LocalAI(temperature=temperature, model_name=model_name, api_base=api_base, api_key=api_key, streaming=streaming)
 llm.globally_use_chat_completions = True
 client = weaviate.Client(weaviate_url)
 vector_store = WeaviateVectorStore(weaviate_client=client, index_name=index_name)
 storage_context = StorageContext.from_defaults(vector_store=vector_store)
@cl.on_chat_start
 async def factory():
    """Build a query engine and store it in the chainlit user session.

    The engine is created from the module-level vector store, storage
    context, LLM and embedding model, and saved under the session key
    "query_engine".
    """
    predictor = LLMPredictor(llm=llm)
    callbacks = CallbackManager([cl.LlamaIndexCallbackHandler()])
    context = ServiceContext.from_defaults(
        embed_model=embed_model,
        callback_manager=callbacks,
        llm_predictor=predictor,
        chunk_size=chunk_size,
    )
    store_index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context, service_context=context)
    engine = store_index.as_query_engine(
        vector_store_query_mode=query_mode,
        similarity_top_k=topK,
        alpha=alpha,
        streaming=True,
    )
    cl.user_session.set("query_engine", engine)
@cl.on_message
 async def main(message: cl.Message):
    """Answer one incoming chat message with the session's query engine.

    The query runs through cl.make_async, its tokens are streamed into a
    fresh reply message, and the reply is sent once streaming is finished.
    """
    engine = cl.user_session.get("query_engine")
    run_query = cl.make_async(engine.query)
    answer = await run_query(message.content)
    reply = cl.Message(content="")
    for piece in answer.response_gen:
        await reply.stream_token(token=piece)
    # When the response carries its full text, use it as the final content.
    final_text = answer.response_txt
    if final_text:
        reply.content = final_text
    await reply.send()
--- a/examples/chainlit/requirements.txt
+++ b/examples/chainlit/requirements.txt
@@ -0,0 +1,7 @@
 llama_hub==0.0.41
 llama_index==0.8.55
 Requests==2.31.0
 weaviate_client==3.25.1
 transformers
 torch
 chainlit
--- a/examples/chatbot-ui-manual/models
+++ b/examples/chatbot-ui-manual/models
@@ -0,0 +1 @@
 ../models
--- a/examples/configurations/README.md
+++ b/examples/configurations/README.md
@@ -0,0 +1,42 @@
 ## Advanced configuration
 This section contains examples on how to install models manually with config files.
 ### Prerequisites
 First clone LocalAI:
 ```bash
 git clone https://github.com/go-skynet/LocalAI
 cd LocalAI
 ```
 Setup the model you prefer from the examples below and then start LocalAI:
 ```bash
 docker compose up -d --pull always
 ```
 If LocalAI is already started, you can restart it with:
 ```bash
 docker compose restart
 ```
 See also the getting started: https://localai.io/basics/getting_started/
 ### Mistral
 To set up Mistral, copy the files inside `mistral` into the `models` folder:
 ```bash
 cp -r examples/configurations/mistral/* models/
 ```
 Now download the model:
 ```bash
 wget https://huggingface.co/TheBloke/Mistral-7B-OpenOrca-GGUF/resolve/main/mistral-7b-openorca.Q6_K.gguf -O models/mistral-7b-openorca.Q6_K.gguf
 ```
--- a/examples/configurations/llava/README.md
+++ b/examples/configurations/llava/README.md
@@ -0,0 +1,18 @@
 ![llava](https://github.com/mudler/LocalAI/assets/2420543/cb0a0897-3b58-4350-af66-e6f4387b58d3)
 ## Setup
 ```
 mkdir models
 wget https://huggingface.co/mys/ggml_bakllava-1/resolve/main/ggml-model-q4_k.gguf -O models/ggml-model-q4_k.gguf
 wget https://huggingface.co/mys/ggml_bakllava-1/resolve/main/mmproj-model-f16.gguf -O models/mmproj-model-f16.gguf
 docker run -p 8080:8080 -v $PWD/models:/models -ti --rm quay.io/go-skynet/local-ai:master --models-path /models --threads 4
 ```
 ## Try it out
 ```
 curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
     "model": "llava",
     "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
 ```
--- a/examples/configurations/llava/chat-simple.tmpl
+++ b/examples/configurations/llava/chat-simple.tmpl
@@ -0,0 +1,3 @@
 A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
 {{.Input}}
 ASSISTANT:
--- a/examples/configurations/llava/llava.yaml
+++ b/examples/configurations/llava/llava.yaml
@@ -0,0 +1,20 @@
 # Model definition for the "llava" example.
 name: llava
 backend: llama-cpp
 # Model files; both are downloaded into the models directory in the README.
 parameters:
   model: ggml-model-q4_k.gguf
   temperature: 0.2
   top_k: 40
   top_p: 0.95
 mmproj: mmproj-model-f16.gguf
 # Runtime settings
 context_size: 4096
 threads: 11
 gpu_layers: 90
 f16: true
 mmap: true
 # Prompt construction (chat-simple presumably refers to chat-simple.tmpl)
 roles:
   user: 'USER:'
   assistant: 'ASSISTANT:'
   system: 'SYSTEM:'
 template:
   chat: chat-simple
--- a/examples/configurations/mistral/chatml-block.tmpl
+++ b/examples/configurations/mistral/chatml-block.tmpl
@@ -0,0 +1,3 @@
 {{.Input}}
 <|im_start|>assistant
--- a/examples/configurations/mistral/chatml.tmpl
+++ b/examples/configurations/mistral/chatml.tmpl
@@ -0,0 +1,3 @@
 <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
 {{if .Content}}{{.Content}}{{end}}
 <|im_end|>
--- a/examples/chatbot-ui-manual/models/completion.tmpl
+++ b/examples/chatbot-ui-manual/models/completion.tmpl
--- a/examples/configurations/mistral/mistral.yaml
+++ b/examples/configurations/mistral/mistral.yaml
@@ -0,0 +1,16 @@
 # Model definition for the "mistral" example.
 name: mistral
 # Model file; downloaded into the models directory in the README.
 parameters:
   model: mistral-7b-openorca.Q6_K.gguf
   temperature: 0.2
   top_k: 40
   top_p: 0.95
 # Runtime settings
 context_size: 4096
 threads: 4
 f16: true
 mmap: true
 # Prompt templates (names presumably map to the .tmpl files shipped alongside)
 template:
   chat_message: chatml
   chat: chatml-block
   completion: completion
 stopwords:
   - '<|im_end|>'
--- a/examples/discord-bot/.env.example
+++ b/examples/discord-bot/.env.example
@@ -1,3 +1,6 @@
 # CPU .env docs: https://localai.io/howtos/easy-setup-docker-cpu/
 # GPU .env docs: https://localai.io/howtos/easy-setup-docker-gpu/
 OPENAI_API_KEY=x
 DISCORD_BOT_TOKEN=x
 DISCORD_CLIENT_ID=x
--- a/examples/discord-bot/models
+++ b/examples/discord-bot/models
@@ -1 +1 @@
-../chatbot-ui/models/
+../models
--- a/examples/functions/.env.example
+++ b/examples/functions/.env.example
@@ -1,7 +1,11 @@
 # CPU .env docs: https://localai.io/howtos/easy-setup-docker-cpu/
 # GPU .env docs: https://localai.io/howtos/easy-setup-docker-gpu/
 OPENAI_API_KEY=sk---anystringhere
 OPENAI_API_BASE=http://api:8080/v1
 # Models to preload at start
-# Here we configure gpt4all as gpt-3.5-turbo and bert as embeddings
+# Here we configure gpt4all as gpt-3.5-turbo and bert as embeddings,
 # see other options in the model gallery at https://github.com/go-skynet/model-gallery
 PRELOAD_MODELS=[{"url": "github:go-skynet/model-gallery/openllama-7b-open-instruct.yaml", "name": "gpt-3.5-turbo"}]
 ## Change the default number of threads
--- a/examples/functions/README.md
+++ b/examples/functions/README.md
@@ -10,9 +10,12 @@ git clone https://github.com/go-skynet/LocalAI
 cd LocalAI/examples/functions
 cp -rfv .env.example .env
 # Edit the .env file to set a different model by editing `PRELOAD_MODELS`.
 vim .env
 docker-compose run --rm functions
 ```
 Note: The example automatically downloads the `openllama` model as it is under a permissive license.
 See the `.env` configuration file to set a different model with the [model-gallery](https://github.com/go-skynet/model-gallery) by editing `PRELOAD_MODELS`.
--- a/examples/langchain-chroma/.env.example
+++ b/examples/langchain-chroma/.env.example
@@ -1,3 +1,6 @@
 # CPU .env docs: https://localai.io/howtos/easy-setup-docker-cpu/
 # GPU .env docs: https://localai.io/howtos/easy-setup-docker-gpu/
 THREADS=4
 CONTEXT_SIZE=512
 MODELS_PATH=/models
--- a/examples/langchain-chroma/models
+++ b/examples/langchain-chroma/models
@@ -0,0 +1 @@
 ../models
--- a/examples/langchain-chroma/models/gpt-3.5-turbo.yaml
+++ b/examples/langchain-chroma/models/gpt-3.5-turbo.yaml
@@ -1,16 +0,0 @@
 name: gpt-3.5-turbo
 parameters:
  model: ggml-gpt4all-j
  top_k: 80
  temperature: 0.2
  top_p: 0.7
 context_size: 1024
 stopwords:
 - "HUMAN:"
 - "GPT:"
 roles:
  user: " "
  system: " "
 template:
  completion: completion
  chat: gpt4all
--- a/examples/langchain-chroma/models/gpt4all.tmpl
+++ b/examples/langchain-chroma/models/gpt4all.tmpl
@@ -1,4 +0,0 @@
 The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.
 ### Prompt:
 {{.Input}}
 ### Response:
--- a/examples/langchain-huggingface/models
+++ b/examples/langchain-huggingface/models
@@ -0,0 +1 @@
 ../models
--- a/examples/langchain-huggingface/models/completion.tmpl
+++ b/examples/langchain-huggingface/models/completion.tmpl
@@ -1 +0,0 @@
 {{.Input}}
--- a/examples/langchain-huggingface/models/gpt-3.5-turbo.yaml
+++ b/examples/langchain-huggingface/models/gpt-3.5-turbo.yaml
@@ -1,17 +0,0 @@
 name: gpt-3.5-turbo
 parameters:
  model: gpt2
  top_k: 80
  temperature: 0.2
  top_p: 0.7
 context_size: 1024
 backend: "langchain-huggingface"
 stopwords:
 - "HUMAN:"
 - "GPT:"
 roles:
  user: " "
  system: " "
 template:
  completion: completion
  chat: gpt4all
--- a/examples/langchain-huggingface/models/gpt4all.tmpl
+++ b/examples/langchain-huggingface/models/gpt4all.tmpl
@@ -1,4 +0,0 @@
 The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.
 ### Prompt:
 {{.Input}}
 ### Response:
--- a/examples/langchain/models
+++ b/examples/langchain/models
@@ -0,0 +1 @@
 ../models
--- a/examples/langchain/models/completion.tmpl
+++ b/examples/langchain/models/completion.tmpl
@@ -1 +0,0 @@
 {{.Input}}
--- a/examples/langchain/models/gpt-3.5-turbo.yaml
+++ b/examples/langchain/models/gpt-3.5-turbo.yaml
@@ -1,17 +0,0 @@
 name: gpt-3.5-turbo
 parameters:
  model: ggml-gpt4all-j # ggml-koala-13B-4bit-128g
  top_k: 80
  temperature: 0.2
  top_p: 0.7
 context_size: 1024
 stopwords:
 - "HUMAN:"
 - "GPT:"
 roles:
  user: " "
  system: " "
 backend: "gptj"
 template:
  completion: completion
  chat: gpt4all
--- a/examples/langchain/models/gpt4all.tmpl
+++ b/examples/langchain/models/gpt4all.tmpl
@@ -1,4 +0,0 @@
 The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.
 ### Prompt:
 {{.Input}}
 ### Response:
--- a/examples/llamaindex/README.md
+++ b/examples/llamaindex/README.md
@@ -0,0 +1,30 @@
 # LocalAI Demonstration with Embeddings
 This demonstration shows you how to use embeddings with existing data in LocalAI. We are using the `llama_index` library to facilitate the embedding and querying processes, and `Weaviate` as the vector store that holds the embeddings.
 ## Prerequisites
 Before proceeding, make sure you have the following installed:
 - Weaviate client
 - LocalAI and its dependencies
 - llama_index and its dependencies
 ## Getting Started
 1. Clone this repository:
 2. Navigate to the project directory:
 3. Run the example:
 `python main.py`
 ```
 Downloading (…)lve/main/config.json: 100%|███████████████████████████| 684/684 [00:00<00:00, 6.01MB/s]
 Downloading model.safetensors: 100%|███████████████████████████████| 133M/133M [00:03<00:00, 39.5MB/s]
 Downloading (…)okenizer_config.json: 100%|███████████████████████████| 366/366 [00:00<00:00, 2.79MB/s]
 Downloading (…)solve/main/vocab.txt: 100%|█████████████████████████| 232k/232k [00:00<00:00, 6.00MB/s]
 Downloading (…)/main/tokenizer.json: 100%|█████████████████████████| 711k/711k [00:00<00:00, 18.8MB/s]
 Downloading (…)cial_tokens_map.json: 100%|███████████████████████████| 125/125 [00:00<00:00, 1.18MB/s]
 LocalAI is a community-driven project that aims to make AI accessible to everyone. It was created by Ettore Di Giacinto and is focused on providing various AI-related features such as text generation with GPTs, text to audio, audio to text, image generation, and more. The project is constantly growing and evolving, with a roadmap for future improvements. Anyone is welcome to contribute, provide feedback, and submit pull requests to help make LocalAI better.
 ```
--- a/examples/llamaindex/main.py
+++ b/examples/llamaindex/main.py
@@ -0,0 +1,38 @@
 import os
 import weaviate
 from llama_index import ServiceContext, VectorStoreIndex, StorageContext
 from llama_index.llms import LocalAI
 from llama_index.vector_stores import WeaviateVectorStore
 from llama_index.storage.storage_context import StorageContext
 # Example script: answer a question with LocalAI as the LLM, using vectors
 # already stored in a Weaviate index (nothing is ingested here).

 # Weaviate client setup
 # NOTE(review): the hostname is spelled "weviate" (not "weaviate") — confirm
 # it matches the deployed service name before changing it.
 client = weaviate.Client("http://weviate.default")

 # Weaviate vector store setup: reads from the "AIChroma" index
 vector_store = WeaviateVectorStore(weaviate_client=client, index_name="AIChroma")

 # Storage context setup
 storage_context = StorageContext.from_defaults(vector_store=vector_store)

 # LocalAI setup: the API key is a placeholder value ("stub")
 llm = LocalAI(temperature=0, model_name="gpt-3.5-turbo", api_base="http://local-ai.default", api_key="stub")
 llm.globally_use_chat_completions = True

 # Service context setup: embed_model="local" selects a local embedding model
 # rather than a remote embeddings API
 service_context = ServiceContext.from_defaults(llm=llm, embed_model="local")

 # Load index from stored vectors
 index = VectorStoreIndex.from_vector_store(
     vector_store,
     storage_context=storage_context,
     service_context=service_context,
 )

 # Query engine setup: hybrid query mode, keeping only the single best match
 query_engine = index.as_query_engine(similarity_top_k=1, vector_store_query_mode="hybrid")

 # Query example
 response = query_engine.query("What is LocalAI?")
 print(response)
--- a/examples/localai-webui/docker-compose.yml
+++ b/examples/localai-webui/docker-compose.yml
@@ -8,8 +8,6 @@ services:
      dockerfile: Dockerfile
    ports:
      - 8080:8080
    env_file:
      - .env
    volumes:
      - ./models:/models:cached
    command: ["/usr/bin/local-ai"]
--- a/examples/models/.gitignore
+++ b/examples/models/.gitignore
@@ -0,0 +1,7 @@
 # Ignore everything but predefined models
 *
 !.gitignore
 !completion.tmpl
 !embeddings.yaml
 !gpt4all.tmpl
 !gpt-3.5-turbo.yaml
--- a/examples/langchain-chroma/models/completion.tmpl
+++ b/examples/langchain-chroma/models/completion.tmpl
--- a/examples/langchain-chroma/models/embeddings.yaml
+++ b/examples/langchain-chroma/models/embeddings.yaml
--- a/examples/chatbot-ui-manual/models/gpt-3.5-turbo.yaml
+++ b/examples/chatbot-ui-manual/models/gpt-3.5-turbo.yaml
--- a/examples/chatbot-ui-manual/models/gpt4all.tmpl
+++ b/examples/chatbot-ui-manual/models/gpt4all.tmpl
--- a/examples/query_data/models
+++ b/examples/query_data/models
@@ -0,0 +1 @@
 ../models
--- a/examples/query_data/models/completion.tmpl
+++ b/examples/query_data/models/completion.tmpl
@@ -1 +0,0 @@
 {{.Input}}
--- a/examples/query_data/models/embeddings.yaml
+++ b/examples/query_data/models/embeddings.yaml
@@ -1,6 +0,0 @@
 name: text-embedding-ada-002
 parameters:
  model: bert
 threads: 14
 backend: bert-embeddings
 embeddings: true
--- a/examples/query_data/models/gpt-3.5-turbo.yaml
+++ b/examples/query_data/models/gpt-3.5-turbo.yaml
@@ -1,16 +0,0 @@
 name: gpt-3.5-turbo
 parameters:
  model: ggml-gpt4all-j
  top_k: 80
  temperature: 0.2
  top_p: 0.7
 context_size: 1024
 stopwords:
 - "HUMAN:"
 - "GPT:"
 roles:
  user: " "
  system: " "
 template:
  completion: completion
  chat: gpt4all
--- a/examples/slack-bot/.env.example
+++ b/examples/slack-bot/.env.example
@@ -1,3 +1,6 @@
 # CPU .env docs: https://localai.io/howtos/easy-setup-docker-cpu/
 # GPU .env docs: https://localai.io/howtos/easy-setup-docker-gpu/
 SLACK_APP_TOKEN=xapp-1-...
 SLACK_BOT_TOKEN=xoxb-...
 OPENAI_API_KEY=sk-...
--- a/examples/slack-bot/README.md
+++ b/examples/slack-bot/README.md
@@ -18,7 +18,7 @@ git clone https://github.com/seratch/ChatGPT-in-Slack
 # Download gpt4all-j to models/
 wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j
-# Set the discord bot options (see: https://github.com/seratch/ChatGPT-in-Slack)
+# Set the Slack bot options (see: https://github.com/seratch/ChatGPT-in-Slack)
 cp -rfv .env.example .env
 vim .env
--- a/Show More
+++ b/Show More