fix(deps): update github.com/tmc/langchaingo digest to b33244e

2026-02-03 03:02:38 -05:00 · 2023-10-20 21:12:53 +00:00
26 changed files with 518 additions and 375 deletions
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -44,7 +44,7 @@ jobs:
            branch: "master"
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v3
      - name: Bump dependencies 🔧
        run: |
          bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }}
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -24,16 +24,42 @@ jobs:
            tag-latest: 'auto'
            tag-suffix: ''
            ffmpeg: ''
+          - build-type: 'cublas'
+            cuda-major-version: 11
+            cuda-minor-version: 7
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-cublas-cuda11'
+            ffmpeg: ''
+          - build-type: 'cublas'
+            cuda-major-version: 12
+            cuda-minor-version: 1
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-cublas-cuda12'
+            ffmpeg: ''
          - build-type: ''
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-ffmpeg'
            ffmpeg: 'true'
+          - build-type: 'cublas'
+            cuda-major-version: 11
+            cuda-minor-version: 7
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-cublas-cuda11-ffmpeg'
+            ffmpeg: 'true'
+          - build-type: 'cublas'
+            cuda-major-version: 12
+            cuda-minor-version: 1
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-cublas-cuda12-ffmpeg'
+            ffmpeg: 'true'

    runs-on: ubuntu-latest
    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
      - name: Release space from worker
        run: |
          echo "Listing top largest packages"
@@ -57,10 +83,6 @@ jobs:
          sudo apt-get remove -y azure-cli || true
          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
          sudo apt-get remove -y '^gfortran-.*' || true
-          sudo apt-get remove -y microsoft-edge-stable || true
-          sudo apt-get remove -y firefox || true
-          sudo apt-get remove -y powershell || true
-          sudo apt-get remove -y r-base-core || true
          sudo apt-get autoremove -y
          sudo apt-get clean
          echo
@@ -70,98 +92,8 @@ jobs:
          echo
          sudo rm -rfv build || true
          df -h
-      - name: Docker meta
-        id: meta
-        uses: docker/metadata-action@v5
-        with:
-          images: quay.io/go-skynet/local-ai
-          tags: |
-            type=ref,event=branch
-            type=semver,pattern={{raw}}
-            type=sha
-          flavor: |
-            latest=${{ matrix.tag-latest }}
-            suffix=${{ matrix.tag-suffix }}
-
-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@master
-        with:
-          platforms: all
-
-      - name: Set up Docker Buildx
-        id: buildx
-        uses: docker/setup-buildx-action@master
-
-      - name: Login to DockerHub
-        if: github.event_name != 'pull_request'
-        uses: docker/login-action@v3
-        with:
-          registry: quay.io
-          username: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
-          password: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
-
-      - name: Build and push
-        uses: docker/build-push-action@v5
-        with:
-          builder: ${{ steps.buildx.outputs.name }}
-          build-args: |
-            BUILD_TYPE=${{ matrix.build-type }}
-            CUDA_MAJOR_VERSION=${{ matrix.cuda-major-version }}
-            CUDA_MINOR_VERSION=${{ matrix.cuda-minor-version }}
-            FFMPEG=${{ matrix.ffmpeg }}
-          context: .
-          file: ./Dockerfile
-          platforms: ${{ matrix.platforms }}
-          push: ${{ github.event_name != 'pull_request' }}
-          tags: ${{ steps.meta.outputs.tags }}
-          labels: ${{ steps.meta.outputs.labels }}
-
-
-  docker-gpu:
-    strategy:
-      matrix:
-        include:
-          - build-type: 'cublas'
-            cuda-major-version: 11
-            cuda-minor-version: 7
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda11'
-            ffmpeg: ''
-          - build-type: 'cublas'
-            cuda-major-version: 12
-            cuda-minor-version: 1
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda12'
-            ffmpeg: ''
-          - build-type: 'cublas'
-            cuda-major-version: 11
-            cuda-minor-version: 7
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda11-ffmpeg'
-            ffmpeg: 'true'
-          - build-type: 'cublas'
-            cuda-major-version: 12
-            cuda-minor-version: 1
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda12-ffmpeg'
-            ffmpeg: 'true'
-
-    runs-on: arc-runner-set 
-    steps:
-      - name: Force Install GIT latest
-        run: |
-          sudo apt-get update \
-          && sudo apt-get install -y software-properties-common \
-          && sudo apt-get update \
-          && sudo add-apt-repository -y ppa:git-core/ppa \
-          && sudo apt-get update \
-          && sudo apt-get install -y git
      - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Docker meta
        id: meta
@@ -192,6 +124,7 @@ jobs:
          registry: quay.io
          username: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
          password: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
+
      - name: Build and push
        uses: docker/build-push-action@v5
        with:
@@ -207,7 +140,3 @@ jobs:
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
-      - name: Release space from worker ♻
-        if: always()
-        run: |
-          docker system prune -f -a --volumes || true
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -19,7 +19,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
        with:
          submodules: true
      - uses: actions/setup-go@v4
@@ -66,7 +66,7 @@ jobs:
    runs-on: macOS-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
        with:
          submodules: true
      - uses: actions/setup-go@v4
--- a/.github/workflows/test-gpu.yml
+++ b/.github/workflows/test-gpu.yml
@@ -15,13 +15,13 @@ concurrency:

 jobs:
  ubuntu-latest:
-    runs-on: gpu
+    runs-on: self-hosted
    strategy:
      matrix:
        go-version: ['1.21.x']
    steps:
      - name: Clone
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
        with: 
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
@@ -40,8 +40,6 @@ jobs:
          if [ ! -e /run/systemd/system ]; then
            sudo mkdir /run/systemd/system
          fi
-          sudo mkdir -p /host/tests/${{ github.head_ref || github.ref }}
-          sudo chmod -R 777 /host/tests/${{ github.head_ref || github.ref }}
          make \
            TEST_DIR="/host/tests/${{ github.head_ref || github.ref }}" \
            BUILD_TYPE=cublas \
@@ -59,5 +57,4 @@ jobs:
          make \
            TEST_DIR="/host/tests/${{ github.head_ref || github.ref }}" \
            teardown-e2e || true
-          sudo rm -rf /host/tests/${{ github.head_ref || github.ref }} || true
-          docker system prune -f -a --volumes || true
+          docker system prune -f -a --volumes || true
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -53,7 +53,7 @@ jobs:
          sudo rm -rfv build || true
          df -h
      - name: Clone
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
        with: 
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
@@ -108,7 +108,7 @@ jobs:
        go-version: ['1.21.x']
    steps:
      - name: Clone
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
        with: 
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
--- a/20
+++ b/20
@@ -19,7 +19,7 @@ ENV GALLERIES='[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/i
 ARG GO_TAGS="stablediffusion tts"

 RUN apt-get update && \
-    apt-get install -y ca-certificates curl patch pip cmake && apt-get clean
+    apt-get install -y ca-certificates curl patch pip cmake


 # Use the variables in subsequent instructions
@@ -34,18 +34,17 @@ RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
    dpkg -i cuda-keyring_1.0-1_all.deb && \
    rm -f cuda-keyring_1.0-1_all.deb && \
    apt-get update && \
-    apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}  && apt-get clean \
+    apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
    ; fi
 ENV PATH /usr/local/cuda/bin:${PATH}

-# OpenBLAS requirements and stable diffusion
-RUN apt-get install -y \
-    libopenblas-dev \
-    libopencv-dev \ 
-    && apt-get clean
+# OpenBLAS requirements
+RUN apt-get install -y libopenblas-dev
+
+# Stable Diffusion requirements
+RUN apt-get install -y libopencv-dev && \
+    ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2

-# Set up OpenCV
-RUN ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2

 WORKDIR /build

@@ -69,7 +68,8 @@ RUN curl -L "https://github.com/gabime/spdlog/archive/refs/tags/v${SPDLOG_VERSIO
    cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /usr/lib/ && \
    ln -s /usr/lib/libpiper_phonemize.so /usr/lib/libpiper_phonemize.so.1 && \
    cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/include/. /usr/include/ && \
-    rm spdlog-${SPDLOG_VERSION} -rf
+    rm spdlog-${SPDLOG_VERSION} -rf && \
+    rm /build/lib/Linux-$(uname -m)/piper_phonemize -rf

 # Extras requirements
 FROM requirements-core as requirements-extras
--- a/52
+++ b/52
@@ -4,11 +4,11 @@ GOVET=$(GOCMD) vet
 BINARY_NAME=local-ai

 # llama.cpp versions
-GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0
+GOLLAMA_VERSION?=1676dcd7a139b6cdfbaea5fd67f46dc25d9d8bcf

 GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7

-CPPLLAMA_VERSION?=6e08281e588bbba1a5d180290a94a43f167f3a1a
+CPPLLAMA_VERSION?=24ba3d829e31a6eda3fa1723f692608c2fa3adda

 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -30,9 +30,15 @@ BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d
 # go-piper version
 PIPER_VERSION?=56b8a81b4760a6fbee1a82e62f007ae7e8f010a7

+# go-bloomz version
+BLOOMZ_VERSION?=1834e77b83faafe912ad4092ccf7f77937349e2f
+
 # stablediffusion version
 STABLEDIFFUSION_VERSION?=d89260f598afb809279bc72aa0107b4292587632

+# go-ggllm version
+GOGGLLM_VERSION?=862477d16eefb0805261c19c9b0d053e3b2b684b
+
 export BUILD_TYPE?=
 export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
 export CMAKE_ARGS?=
@@ -123,13 +129,7 @@ ifeq ($(findstring tts,$(GO_TAGS)),tts)
 	OPTIONAL_GRPC+=backend-assets/grpc/piper
 endif

-ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/llama backend-assets/grpc/llama-cpp backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
-GRPC_BACKENDS?=$(ALL_GRPC_BACKENDS) $(OPTIONAL_GRPC)
-
-# If empty, then we build all
-ifeq ($(GRPC_BACKENDS),)
-	GRPC_BACKENDS=$(ALL_GRPC_BACKENDS)
-endif
+GRPC_BACKENDS?=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/llama-cpp backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)

 .PHONY: all test build vendor

@@ -140,6 +140,14 @@ gpt4all:
 	git clone --recurse-submodules $(GPT4ALL_REPO) gpt4all
 	cd gpt4all && git checkout -b build $(GPT4ALL_VERSION) && git submodule update --init --recursive --depth 1

+## go-ggllm
+go-ggllm:
+	git clone --recurse-submodules https://github.com/mudler/go-ggllm.cpp go-ggllm
+	cd go-ggllm && git checkout -b build $(GOGGLLM_VERSION) && git submodule update --init --recursive --depth 1
+
+go-ggllm/libggllm.a: go-ggllm
+	$(MAKE) -C go-ggllm BUILD_TYPE=$(BUILD_TYPE) libggllm.a
+
 ## go-piper
 go-piper:
 	git clone --recurse-submodules https://github.com/mudler/go-piper go-piper
@@ -166,6 +174,14 @@ go-rwkv:
 go-rwkv/librwkv.a: go-rwkv
 	cd go-rwkv && cd rwkv.cpp &&	cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF &&	cmake --build . && 	cp librwkv.a ..

+## bloomz
+bloomz:
+	git clone --recurse-submodules https://github.com/go-skynet/bloomz.cpp bloomz
+	cd bloomz && git checkout -b build $(BLOOMZ_VERSION) && git submodule update --init --recursive --depth 1
+
+bloomz/libbloomz.a: bloomz
+	cd bloomz && make libbloomz.a
+
 go-bert/libgobert.a: go-bert
 	$(MAKE) -C go-bert libgobert.a

@@ -219,7 +235,7 @@ go-llama-stable/libbinding.a: go-llama-stable
 go-piper/libpiper_binding.a: go-piper
 	$(MAKE) -C go-piper libpiper_binding.a example/main

-get-sources: go-llama go-llama-stable go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert go-stable-diffusion
+get-sources: go-llama go-llama-stable go-ggllm go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert bloomz go-stable-diffusion
 	touch $@

 replace:
@@ -228,8 +244,10 @@ replace:
 	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/go-rwkv
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(shell pwd)/whisper.cpp
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(shell pwd)/go-bert
+	$(GOCMD) mod edit -replace github.com/go-skynet/bloomz.cpp=$(shell pwd)/bloomz
 	$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(shell pwd)/go-stable-diffusion
 	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(shell pwd)/go-piper
+	$(GOCMD) mod edit -replace github.com/mudler/go-ggllm.cpp=$(shell pwd)/go-ggllm

 prepare-sources: get-sources replace
 	$(GOCMD) mod download
@@ -245,7 +263,9 @@ rebuild: ## Rebuilds the project
 	$(MAKE) -C whisper.cpp clean
 	$(MAKE) -C go-stable-diffusion clean
 	$(MAKE) -C go-bert clean
+	$(MAKE) -C bloomz clean
 	$(MAKE) -C go-piper clean
+	$(MAKE) -C go-ggllm clean
 	$(MAKE) build

 prepare: prepare-sources $(OPTIONAL_TARGETS)
@@ -263,8 +283,10 @@ clean: ## Remove build related file
 	rm -rf ./backend-assets
 	rm -rf ./go-rwkv
 	rm -rf ./go-bert
+	rm -rf ./bloomz
 	rm -rf ./whisper.cpp
 	rm -rf ./go-piper
+	rm -rf ./go-ggllm
 	rm -rf $(BINARY_NAME)
 	rm -rf release/
 	$(MAKE) -C backend/cpp/llama clean
@@ -292,7 +314,7 @@ test-models/testmodel:
 	mkdir test-dir
 	wget https://huggingface.co/nnakasato/ggml-model-test/resolve/main/ggml-model-q4.bin -O test-models/testmodel
 	wget https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
-	wget https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
+	wget https://huggingface.co/skeskinen/ggml/resolve/main/all-MiniLM-L6-v2/ggml-model-q4_0.bin -O test-models/bert
 	wget https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
 	wget https://huggingface.co/mudler/rwkv-4-raven-1.5B-ggml/resolve/main/RWKV-4-Raven-1B5-v11-Eng99%2525-Other1%2525-20230425-ctx4096_Q4_0.bin -O test-models/rwkv
 	wget https://raw.githubusercontent.com/saharNooby/rwkv.cpp/5eb8f09c146ea8124633ab041d9ea0b1f1db4459/rwkv/20B_tokenizer.json -O test-models/rwkv.tokenizer.json
@@ -390,6 +412,10 @@ protogen-python:
 backend-assets/grpc:
 	mkdir -p backend-assets/grpc

+backend-assets/grpc/falcon: backend-assets/grpc go-ggllm/libggllm.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggllm LIBRARY_PATH=$(shell pwd)/go-ggllm \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/falcon ./cmd/grpc/falcon/
+
 backend-assets/grpc/llama: backend-assets/grpc go-llama/libbinding.a
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama LIBRARY_PATH=$(shell pwd)/go-llama \
@@ -454,6 +480,10 @@ backend-assets/grpc/rwkv: backend-assets/grpc go-rwkv/librwkv.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-rwkv LIBRARY_PATH=$(shell pwd)/go-rwkv \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./cmd/grpc/rwkv/

+backend-assets/grpc/bloomz: backend-assets/grpc bloomz/libbloomz.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/bloomz LIBRARY_PATH=$(shell pwd)/bloomz \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bloomz ./cmd/grpc/bloomz/
+
 backend-assets/grpc/bert-embeddings: backend-assets/grpc go-bert/libgobert.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-bert LIBRARY_PATH=$(shell pwd)/go-bert \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./cmd/grpc/bert-embeddings/
--- a/api/api_test.go
+++ b/api/api_test.go
@@ -457,7 +457,7 @@ var _ = Describe("API test", func() {
 				Eventually(func() bool {
 					response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
 					return response["processed"].(bool)
-				}, "960s", "10s").Should(Equal(true))
+				}, "360s", "10s").Should(Equal(true))

 				resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "gpt4all-j", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: "How are you?"}}})
 				Expect(err).ToNot(HaveOccurred())
@@ -687,7 +687,7 @@ var _ = Describe("API test", func() {
 					Input: []string{"sun", "cat"},
 				},
 			)
-			Expect(err).ToNot(HaveOccurred(), err)
+			Expect(err).ToNot(HaveOccurred())
 			Expect(len(resp.Data[0].Embedding)).To(BeNumerically("==", 384))
 			Expect(len(resp.Data[1].Embedding)).To(BeNumerically("==", 384))

--- a/backend/cpp/llama/CMakeLists.txt
+++ b/backend/cpp/llama/CMakeLists.txt
@@ -4,11 +4,6 @@ set(TARGET grpc-server)
 set(_PROTOBUF_LIBPROTOBUF libprotobuf)
 set(_REFLECTION grpc++_reflection)

-if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
-    link_directories("/opt/homebrew/lib")
-    include_directories("/opt/homebrew/include")
-endif()
-
 find_package(absl CONFIG REQUIRED)
 find_package(Protobuf CONFIG REQUIRED)
 find_package(gRPC CONFIG REQUIRED)
@@ -20,7 +15,8 @@ find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin)
 include_directories(${CMAKE_CURRENT_BINARY_DIR})
 include_directories(${Protobuf_INCLUDE_DIRS})

-message(STATUS "Using protobuf version ${Protobuf_VERSION} | Protobuf_INCLUDE_DIRS: ${Protobuf_INCLUDE_DIRS} | CMAKE_CURRENT_BINARY_DIR: ${CMAKE_CURRENT_BINARY_DIR}")
+message(STATUS "Using protobuf ${Protobuf_VERSION} ${Protobuf_INCLUDE_DIRS} ${CMAKE_CURRENT_BINARY_DIR}")
+

 # Proto file
 get_filename_component(hw_proto "../../../../../../pkg/grpc/proto/backend.proto" ABSOLUTE)
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=
+LLAMA_VERSION?=24ba3d829e31a6eda3fa1723f692608c2fa3adda

 CMAKE_ARGS?=
 BUILD_TYPE?=
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -88,7 +88,6 @@ static size_t find_partial_stop_string(const std::string &stop,
    return std::string::npos;
 }

-
 template <class Iter>
 static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
 {
@@ -129,16 +128,18 @@ struct llama_server_context
    size_t n_past = 0;
    size_t n_remain = 0;

+   // json prompt;
    std::vector<llama_token> embd;
-
-    gpt_params params;
+    std::vector<llama_token> last_n_tokens;

    llama_model *model = nullptr;
    llama_context *ctx = nullptr;
-    llama_sampling_context *ctx_sampling = nullptr;
-
+    gpt_params params;
    int n_ctx;

+    grammar_parser::parse_state parsed_grammar;
+    llama_grammar *grammar = nullptr;
+
    bool truncated = false;
    bool stopped_eos = false;
    bool stopped_word = false;
@@ -170,7 +171,7 @@ struct llama_server_context
    void rewind()
    {
        params.antiprompt.clear();
-        params.sparams.grammar.clear();
+        params.grammar.clear();
        num_prompt_tokens = 0;
        num_tokens_predicted = 0;
        generated_text = "";
@@ -184,87 +185,100 @@ struct llama_server_context
        multibyte_pending = 0;
        n_remain = 0;
        n_past = 0;
-        params.sparams.n_prev = n_ctx;
-    }

-    void initSampling() {
-        if (ctx_sampling != nullptr) {
-            llama_sampling_free(ctx_sampling);
+        if (grammar != nullptr) {
+            llama_grammar_free(grammar);
+            grammar = nullptr;
        }
-        ctx_sampling = llama_sampling_init(params.sparams);
    }

    bool loadModel(const gpt_params &params_)
    {
+                    printf("load model %s\n", params_.model.c_str());
+
        params = params_;
        std::tie(model, ctx) = llama_init_from_gpt_params(params);
        if (model == nullptr)
        {
+            printf("unable to load model %s\n", params_.model.c_str());
            return false;
        }
        n_ctx = llama_n_ctx(ctx);
+        last_n_tokens.resize(n_ctx);
+        std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
        return true;
    }
-    std::vector<llama_token> tokenize_string(const char *prompt, bool add_bos) const {
-       // If `add_bos` is true, we only add BOS, when json_prompt is a string,
+
+     std::vector<llama_token> tokenize_array(const char **prompts, bool add_bos) const
+    {
+        // If `add_bos` is true, we only add BOS, when json_prompt is a string,
        // or the first element of the json_prompt array is a string.
-        std::vector<llama_token> prompt_tokens; 
-        auto s = std::string(prompt);
-        prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
+        std::vector<llama_token> prompt_tokens;
+
+  
+        bool first = true;
+        // Iterate over prompts
+        for (const char **p = prompts; *p != nullptr; ++p)
+        {
+            auto s = std::string(*p);
+            std::vector<llama_token> pp;
+            if (first)
+            {
+                pp = ::llama_tokenize(ctx, s, add_bos);
+                first = false;
+            }
+            else
+            {
+                pp = ::llama_tokenize(ctx, s, false);
+            }
+            prompt_tokens.insert(prompt_tokens.end(), pp.begin(), pp.end());
+        }
+        
+
        return prompt_tokens;
    }
-     std::vector<llama_token> tokenize_array(const char **prompts, bool add_bos) const {
-                std::vector<llama_token> prompt_tokens; 

-            bool first = true;
-            bool is_string = true;
-            for (const char **p = prompts; *p != nullptr; ++p)
-              {
-                if (is_string)
-                {
-                    auto s = std::string(*p);
-                    std::vector<llama_token> p;
-                    if (first)
-                    {
-                        p = ::llama_tokenize(ctx, s, add_bos);
-                        first = false;
-                    }
-                    else
-                    {
-                        p = ::llama_tokenize(ctx, s, false);
-                    }
-                    prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
-                }
-                else
-                {
-                    if (first)
-                    {
-                        first = false;
-                    }
-                    //prompt_tokens.push_back(p.template get<llama_token>());
+    std::vector<llama_token> tokenize_string(const char *prompt, bool add_bos) const
+    {
+        // Tokenize `prompt` as one string; `add_bos` is passed straight through
+        // to ::llama_tokenize (this function has no json_prompt input).
+        std::vector<llama_token> prompt_tokens;
+  
+        auto s = std::string(prompt);
+        prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
+    
+        return prompt_tokens;
+    }
+
+    bool loadGrammar()
+    {
+        if (!params.grammar.empty()) {
+            parsed_grammar = grammar_parser::parse(params.grammar.c_str());
+            // will be empty (default) if there are parse errors
+            if (parsed_grammar.rules.empty()) {
+                printf("grammar parse error");
+                return false;
+            }
+            grammar_parser::print_grammar(stderr, parsed_grammar);
+
+            {
+                auto it = params.logit_bias.find(llama_token_eos(ctx));
+                if (it != params.logit_bias.end() && it->second == -INFINITY) {
+                    printf("EOS token is disabled, which will cause most grammars to fail");
                }
            }
-            return prompt_tokens;
-     }

-    void truncatePrompt(std::vector<llama_token> &prompt_tokens) {
-        const int n_left = n_ctx - params.n_keep;
-        const int n_block_size = n_left / 2;
-        const int erased_blocks = (prompt_tokens.size() - params.n_keep - n_block_size) / n_block_size;
-
-        // Keep n_keep tokens at start of prompt (at most n_ctx - 4)
-        std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
-
-        new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
-
-        truncated = true;
-        prompt_tokens = new_tokens;
+            std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
+            grammar = llama_grammar_init(
+                grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+        }
+        return true;
    }

    void loadInfill()
    {
        bool suff_rm_leading_spc = true;
-        if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
+        if (params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
            params.input_suffix.erase(0, 1);
            suff_rm_leading_spc = false;
        }
@@ -275,12 +289,11 @@ struct llama_server_context
        if (suff_rm_leading_spc  && suffix_tokens[0] == space_token) {
            suffix_tokens.erase(suffix_tokens.begin());
        }
-        prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
-        prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
-        prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
+        prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx));
+        prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(ctx)); // always add BOS
+        prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
        prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
-        prefix_tokens.push_back(llama_token_middle(model));
-
+        prefix_tokens.push_back(llama_token_middle(ctx));
        auto prompt_tokens = prefix_tokens;

        num_prompt_tokens = prompt_tokens.size();
@@ -292,24 +305,29 @@ struct llama_server_context
        params.n_keep = std::min(params.n_ctx - 4, params.n_keep);

        // if input prompt is too big, truncate like normal
-        if (num_prompt_tokens >= (size_t) n_ctx)
+        if (num_prompt_tokens >= (size_t)params.n_ctx)
        {
-            truncatePrompt(prompt_tokens);
-            num_prompt_tokens = prompt_tokens.size();
+            printf("Input prompt is too big, truncating. Can only take %d tokens but got %zu\n", params.n_ctx, num_prompt_tokens);
+            // TODO: we probably want to cut from both sides
+            const int n_left = (params.n_ctx - params.n_keep) / 2;
+            std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
+            const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left;
+            new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
+            std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin());

-            GGML_ASSERT(num_prompt_tokens < (size_t)n_ctx);
+            truncated = true;
+            prompt_tokens = new_tokens;
        }
-
-        // push the prompt into the sampling context (do not apply grammar)
-        for (auto & token : prompt_tokens)
+        else
        {
-            llama_sampling_accept(ctx_sampling, ctx, token, false);
+            const size_t ps = num_prompt_tokens;
+            std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0);
+            std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps);
        }

        // compare the evaluated prompt with the new prompt
        n_past = common_part(embd, prompt_tokens);
        embd = prompt_tokens;
-
        if (n_past == num_prompt_tokens)
        {
            // we have to evaluate at least 1 token to generate logits.
@@ -317,7 +335,6 @@ struct llama_server_context
            n_past--;
        }

-        // since #3228 we now have to manually manage the KV cache
        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);

        has_next_token = true;
@@ -335,33 +352,38 @@ struct llama_server_context
        params.n_keep = std::min(n_ctx - 4, params.n_keep);

        // if input prompt is too big, truncate like normal
-        if (num_prompt_tokens >= (size_t) n_ctx)
+        if (num_prompt_tokens >= (size_t)n_ctx)
        {
-            truncatePrompt(prompt_tokens);
-            num_prompt_tokens = prompt_tokens.size();
+            const int n_left = (n_ctx - params.n_keep) / 2;
+            std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
+            const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left;
+            new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
+            std::copy(prompt_tokens.end() - n_ctx, prompt_tokens.end(), last_n_tokens.begin());

-            GGML_ASSERT(num_prompt_tokens < (size_t)n_ctx);
+
+            truncated = true;
+            prompt_tokens = new_tokens;
        }
-
-        // push the prompt into the sampling context (do not apply grammar)
-        for (auto & token : prompt_tokens)
+        else
        {
-            llama_sampling_accept(ctx_sampling, ctx, token, false);
+            const size_t ps = num_prompt_tokens;
+            std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0);
+            std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps);
        }

        // compare the evaluated prompt with the new prompt
        n_past = common_part(embd, prompt_tokens);

+
        embd = prompt_tokens;
        if (n_past == num_prompt_tokens)
        {
            // we have to evaluate at least 1 token to generate logits.
            n_past--;
        }
-
        // since #3228 we now have to manually manage the KV cache
-        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);

+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
        has_next_token = true;
    }

@@ -396,6 +418,7 @@ struct llama_server_context
            n_past -= n_discard;

            truncated = true;
+       
        }

        bool tg = true;
@@ -410,6 +433,7 @@ struct llama_server_context

            if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0)))
            {
+             
                has_next_token = false;
                return result;
            }
@@ -419,30 +443,33 @@ struct llama_server_context
        if (params.n_predict == 0)
        {
            has_next_token = false;
-            result.tok = llama_token_eos(model);
+            result.tok = llama_token_eos(ctx);
            return result;
        }

        {
            // out of user input, sample next token
-            result.tok = llama_sampling_sample(ctx_sampling, ctx, NULL);
+            std::vector<llama_token_data> candidates;
+            candidates.reserve(llama_n_vocab(model));

-            llama_token_data_array cur_p = { ctx_sampling->cur.data(), ctx_sampling->cur.size(), false };
+            result.tok = llama_sample_token(ctx, NULL, grammar, params, last_n_tokens, candidates);

-            const int32_t n_probs = params.sparams.n_probs;
-            if (params.sparams.temp <= 0 && n_probs > 0)
+            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+            const int32_t n_probs = params.n_probs;
+            if (params.temp <= 0 && n_probs > 0)
            {
                // For llama_sample_token_greedy we need to sort candidates
-                llama_sample_softmax(ctx, &cur_p);
+                llama_sample_softmax(ctx, &candidates_p);
            }

-            for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
+            for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i)
            {
-                result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
+                result.probs.push_back({candidates_p.data[i].id, candidates_p.data[i].p});
            }

-            llama_sampling_accept(ctx_sampling, ctx, result.tok, true);
-
+            last_n_tokens.erase(last_n_tokens.begin());
+            last_n_tokens.push_back(result.tok);
            if (tg) {
                num_tokens_predicted++;
            }
@@ -453,7 +480,7 @@ struct llama_server_context
        // decrement remaining sampling budget
        --n_remain;

-        if (!embd.empty() && embd.back() == llama_token_eos(model))
+        if (!embd.empty() && embd.back() == llama_token_eos(ctx))
        {
            // stopping_word = llama_token_to_piece(ctx, embd.back());
            has_next_token = false;
@@ -504,7 +531,7 @@ struct llama_server_context
        const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
        generated_text += token_text;

-        if (params.sparams.n_probs > 0)
+        if (params.n_probs > 0)
        {
            generated_token_probs.push_back(token_with_probs);
        }
@@ -556,6 +583,7 @@ struct llama_server_context
        static const int n_embd = llama_n_embd(model);
        if (!params.embedding)
        {
+            printf("embedding disabled");
            return std::vector<float>(n_embd, 0.0f);
        }
        const float *data = llama_get_embeddings(ctx);
@@ -571,30 +599,30 @@ static void parse_options_completion(bool streaming,const backend::PredictOption

    llama.stream = streaming;
    llama.params.n_predict = predict->tokens() == 0 ? -1 : predict->tokens();
-    llama.params.sparams.top_k = predict->topk();
-    llama.params.sparams.top_p = predict->topp();
-    llama.params.sparams.tfs_z = predict->tailfreesamplingz();
-    llama.params.sparams.typical_p = predict->typicalp();
-    llama.params.sparams.penalty_last_n = predict->repeat();
-    llama.params.sparams.temp = predict->temperature();
-    llama.params.sparams.penalty_repeat = predict->penalty();
-    llama.params.sparams.penalty_present = predict->presencepenalty();
-    llama.params.sparams.penalty_freq = predict->frequencypenalty();
-    llama.params.sparams.mirostat = predict->mirostat();
-    llama.params.sparams.mirostat_tau = predict->mirostattau();
-    llama.params.sparams.mirostat_eta = predict->mirostateta();
-    llama.params.sparams.penalize_nl = predict->penalizenl();
+    llama.params.top_k = predict->topk();
+    llama.params.top_p = predict->topp();
+    llama.params.tfs_z = predict->tailfreesamplingz();
+    llama.params.typical_p = predict->typicalp();
+    llama.params.repeat_last_n = predict->repeat();
+    llama.params.temp = predict->temperature();
+    llama.params.repeat_penalty = predict->penalty();
+    llama.params.presence_penalty = predict->presencepenalty();
+    llama.params.frequency_penalty = predict->frequencypenalty();
+    llama.params.mirostat = predict->mirostat();
+    llama.params.mirostat_tau = predict->mirostattau();
+    llama.params.mirostat_eta = predict->mirostateta();
+    llama.params.penalize_nl = predict->penalizenl();
    llama.params.n_keep = predict->nkeep();
    llama.params.seed = predict->seed();
-    llama.params.sparams.grammar = predict->grammar();
+    llama.params.grammar = predict->grammar();
    // llama.params.n_probs = predict->
    llama.params.prompt = predict->prompt();

-    llama.params.sparams.logit_bias.clear();
+    llama.params.logit_bias.clear();

    if (predict->ignoreeos())
    {
-        llama.params.sparams.logit_bias[llama_token_eos(llama.model)] = -INFINITY;
+        llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
    }

    // const auto &logit_bias = body.find("logit_bias");
@@ -676,7 +704,7 @@ static void params_parse(const backend::ModelOptions* request,
 }

 static bool is_at_eob(llama_server_context &server_context, const llama_token *tokens, const size_t n_tokens) {
-    return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.model);
+    return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx);
 }

 // Function matching type llama_beam_search_callback_fn_t.
@@ -773,7 +801,12 @@ public:

        parse_options_completion(false, request, llama);

-        llama.initSampling();
+        if (!llama.loadGrammar())
+        {
+            //res.status = 400;
+            return Status::CANCELLED;
+        }
+
        llama.loadPrompt(request->prompt());
        llama.beginCompletion();
        size_t sent_count = 0;
@@ -815,7 +848,7 @@ public:

                std::vector<completion_token_output> probs_output = {};

-                if (llama.params.sparams.n_probs > 0) {
+                if (llama.params.n_probs > 0) {
                    const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
                    size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
                    size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
@@ -846,7 +879,12 @@ public:
        llama_reset_timings(llama.ctx);
        parse_options_completion(false, request, llama);

-        llama.initSampling();
+        if (!llama.loadGrammar())
+        {
+            //res.status = 400;
+            return Status::CANCELLED;
+        }
+
        llama.loadPrompt(request->prompt());
        llama.beginCompletion();

@@ -877,7 +915,7 @@ public:
        }

        auto probs = llama.generated_token_probs;
-        if (llama.params.sparams.n_probs > 0 && llama.stopped_word) {
+        if (llama.params.n_probs > 0 && llama.stopped_word) {
            const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
            probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
        }
--- a/cmd/grpc/bloomz/main.go
+++ b/cmd/grpc/bloomz/main.go
@@ -0,0 +1,23 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	bloomz "github.com/go-skynet/LocalAI/pkg/backend/llm/bloomz"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &bloomz.LLM{}); err != nil {
+		panic(err)
+	}
+}
--- a/cmd/grpc/falcon/main.go
+++ b/cmd/grpc/falcon/main.go
@@ -0,0 +1,25 @@
+package main
+
+// GRPC Falcon server
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	falcon "github.com/go-skynet/LocalAI/pkg/backend/llm/falcon"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &falcon.LLM{}); err != nil {
+		panic(err)
+	}
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -1,25 +0,0 @@
-meta {
-  name: Generate image
-  type: http
-  seq: 1
-}
-
-post {
-  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/images/generations
-  body: json
-  auth: none
-}
-
-headers {
-  Content-Type: application/json
-}
-
-body:json {
-  {
-    "prompt": "<positive prompt>|<negative prompt>",
-    "model": "model-name",
-    "step": 51,
-    "size": "1024x1024",
-    "image": ""
-  }
-}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -15,16 +15,10 @@ headers {
 }

 body:json {
-  {
-    "model": "{{DEFAULT_MODEL}}",
-    "messages": [
-      {
-        "role": "user",
-        "content": "How could one use friction to cook an egg?"
-      }
-    ],
-    "max_tokens": 256,
-    "temperature": 0.2,
-    "grammar": ""
+  {
+       "model": "{{DEFAULT_MODEL}}",
+       "messages": [{"role": "user", "content": "How could one use friction to cook an egg?"}],
+       "max_tokens": 256,
+       "temperature": 0.2
  }
 }
--- a/examples/configurations/README.md
+++ b/examples/configurations/README.md
@@ -1,42 +0,0 @@
-## Advanced configuration
-
-This section contains examples on how to install models manually with config files.
-
-### Prerequisites
-
-First clone LocalAI:
-
-```bash
-git clone https://github.com/go-skynet/LocalAI
-
-cd LocalAI
-```
-
-Setup the model you prefer from the examples below and then start LocalAI:
-
-```bash
-docker compose up -d --pull always
-```
-
-If LocalAI is already started, you can restart it with 
-
-```bash
-docker compose restart
-```
-
-See also the getting started: https://localai.io/basics/getting_started/
-
-### Mistral
-
-To setup mistral copy the files inside `mistral` in the `models` folder:
-
-```bash
-cp -r examples/configurations/mistral/* models/
-```
-
-Now download the model:
-
-```bash
-wget https://huggingface.co/TheBloke/Mistral-7B-OpenOrca-GGUF/resolve/main/mistral-7b-openorca.Q6_K.gguf -O models/mistral-7b-openorca.Q6_K.gguf
-```
-
--- a/examples/configurations/mistral/chatml-block.tmpl
+++ b/examples/configurations/mistral/chatml-block.tmpl
@@ -1,3 +0,0 @@
-{{.Input}}
-<|im_start|>assistant
-
--- a/examples/configurations/mistral/chatml.tmpl
+++ b/examples/configurations/mistral/chatml.tmpl
@@ -1,3 +0,0 @@
-<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
-{{if .Content}}{{.Content}}{{end}}
-<|im_end|>
--- a/examples/configurations/mistral/completion.tmpl
+++ b/examples/configurations/mistral/completion.tmpl
@@ -1 +0,0 @@
-{{.Input}}
--- a/examples/configurations/mistral/mistral.yaml
+++ b/examples/configurations/mistral/mistral.yaml
@@ -1,16 +0,0 @@
-name: mistral
-mmap: true
-parameters:
-  model: mistral-7b-openorca.Q6_K.gguf
-  temperature: 0.2
-  top_k: 40
-  top_p: 0.95
-template:
-  chat_message: chatml
-  chat: chatml-block
-  completion: completion
-context_size: 4096
-f16: true
-stopwords:
- <|im_end|>
-threads: 4
--- a/go.mod
+++ b/go.mod
@@ -20,16 +20,16 @@ require (
 	github.com/mudler/go-ggllm.cpp v0.0.0-20230709223052-862477d16eef
 	github.com/mudler/go-processmanager v0.0.0-20230818213616-f204007f963c
 	github.com/mudler/go-stable-diffusion v0.0.0-20230605122230-d89260f598af
-	github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20231022042237-c25dc5193530
+	github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20231016205817-9a19c740ee84
 	github.com/onsi/ginkgo/v2 v2.13.0
-	github.com/onsi/gomega v1.28.1
+	github.com/onsi/gomega v1.28.0
 	github.com/otiai10/openaigo v1.6.0
 	github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5
 	github.com/prometheus/client_golang v1.17.0
 	github.com/rs/zerolog v1.31.0
 	github.com/sashabaranov/go-openai v1.16.0
 	github.com/schollz/progressbar/v3 v3.13.1
-	github.com/tmc/langchaingo v0.0.0-20231019140956-c636b3da7701
+	github.com/tmc/langchaingo v0.0.0-20231020205806-b33244eb8de8
 	github.com/urfave/cli/v2 v2.25.7
 	github.com/valyala/fasthttp v1.50.0
 	go.opentelemetry.io/otel v1.19.0
@@ -89,7 +89,7 @@ require (
 	github.com/go-audio/riff v1.0.0 // indirect
 	github.com/go-logr/logr v1.2.4 // indirect
 	github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect
-	github.com/google/go-cmp v0.6.0 // indirect
+	github.com/google/go-cmp v0.5.9 // indirect
 	github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38 // indirect
 	github.com/hashicorp/errwrap v1.0.0 // indirect
 	github.com/klauspost/compress v1.16.7 // indirect
--- a/go.sum
+++ b/go.sum
@@ -73,8 +73,6 @@ github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
 github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
 github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
-github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
-github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
 github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
 github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38 h1:yAJXTCF9TqKcTiHJAE8dj7HMvPfh66eeA2JYW7eFpSE=
 github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE=
@@ -137,8 +135,6 @@ github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20231013181651-22de3c
 github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20231013181651-22de3c56bdd4/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI=
 github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20231016205817-9a19c740ee84 h1:AiFzd+M2Uxz67fdn4nCnKR70me5yf88rXhoqhvfRDak=
 github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20231016205817-9a19c740ee84/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI=
-github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20231022042237-c25dc5193530 h1:YXMxHwHMB9jCBo2Yu5gz3mTB3T1TnZs/HmPLv15LUSA=
-github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20231022042237-c25dc5193530/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI=
 github.com/nwaples/rardecode v1.1.0 h1:vSxaY8vQhOcVr4mm5e8XllHWTiM4JF507A0Katqw7MQ=
 github.com/nwaples/rardecode v1.1.0/go.mod h1:5DzqNKiOdpKKBH87u8VlvAnPZMXcGRhxWkRpHbbfGS0=
 github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A=
@@ -155,8 +151,6 @@ github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1y
 github.com/onsi/gomega v1.16.0/go.mod h1:HnhC7FXeEQY45zxNK3PPoIUhzk/80Xly9PcubAlGdZY=
 github.com/onsi/gomega v1.28.0 h1:i2rg/p9n/UqIDAMFUJ6qIUUMcsqOuUHgbpbu235Vr1c=
 github.com/onsi/gomega v1.28.0/go.mod h1:A1H2JE76sI14WIP57LMKj7FVfCHx3g3BcZVjJG8bjX8=
-github.com/onsi/gomega v1.28.1 h1:MijcGUbfYuznzK/5R4CPNoUP/9Xvuo20sXfEm6XxoTA=
-github.com/onsi/gomega v1.28.1/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ=
 github.com/otiai10/mint v1.6.1 h1:kgbTJmOpp/0ce7hk3H8jiSuR0MXmpwWRfqUdKww17qg=
 github.com/otiai10/mint v1.6.1/go.mod h1:MJm72SBthJjz8qhefc4z1PYEieWmy8Bku7CjcAqyUSM=
 github.com/otiai10/openaigo v1.6.0 h1:YTQEbtDSvawETOB/Kmb/6JvuHdHH/eIpSQfHVufiwY8=
@@ -220,6 +214,8 @@ github.com/tmc/langchaingo v0.0.0-20231016073620-a02d4fdc0f3a h1:BziGpoF5ZVWMDy6
 github.com/tmc/langchaingo v0.0.0-20231016073620-a02d4fdc0f3a/go.mod h1:SiwyRS7sBSSi6f3NB4dKENw69X6br/wZ2WRkM+8pZWk=
 github.com/tmc/langchaingo v0.0.0-20231019140956-c636b3da7701 h1:LquLgmFiKf6eDXdwoUKCIGn5NsR34cLXC6ySYhiE6bA=
 github.com/tmc/langchaingo v0.0.0-20231019140956-c636b3da7701/go.mod h1:SiwyRS7sBSSi6f3NB4dKENw69X6br/wZ2WRkM+8pZWk=
+github.com/tmc/langchaingo v0.0.0-20231020205806-b33244eb8de8 h1:LJ/dRV4AZfcrF/BYRmeXUd/MrVb36qFIFRJO+01TmMM=
+github.com/tmc/langchaingo v0.0.0-20231020205806-b33244eb8de8/go.mod h1:SiwyRS7sBSSi6f3NB4dKENw69X6br/wZ2WRkM+8pZWk=
 github.com/ulikunitz/xz v0.5.8/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14=
 github.com/ulikunitz/xz v0.5.9 h1:RsKRIA2MO8x56wkkcd3LbtcE/uMszhb6DpRf+3uwa3I=
 github.com/ulikunitz/xz v0.5.9/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14=
--- a/pkg/backend/llm/bloomz/bloomz.go
+++ b/pkg/backend/llm/bloomz/bloomz.go
@@ -0,0 +1,59 @@
+package bloomz
+
+// This is a wrapper to satisfy the GRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"fmt"
+
+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
+	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+
+	"github.com/go-skynet/bloomz.cpp"
+)
+
+type LLM struct {
+	base.SingleThread
+
+	bloomz *bloomz.Bloomz
+}
+
+func (llm *LLM) Load(opts *pb.ModelOptions) error {
+	model, err := bloomz.New(opts.ModelFile)
+	llm.bloomz = model
+	return err
+}
+
+func buildPredictOptions(opts *pb.PredictOptions) []bloomz.PredictOption {
+	predictOptions := []bloomz.PredictOption{
+		bloomz.SetTemperature(float64(opts.Temperature)),
+		bloomz.SetTopP(float64(opts.TopP)),
+		bloomz.SetTopK(int(opts.TopK)),
+		bloomz.SetTokens(int(opts.Tokens)),
+		bloomz.SetThreads(int(opts.Threads)),
+	}
+
+	if opts.Seed != 0 {
+		predictOptions = append(predictOptions, bloomz.SetSeed(int(opts.Seed)))
+	}
+
+	return predictOptions
+}
+
+func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
+	return llm.bloomz.Predict(opts.Prompt, buildPredictOptions(opts)...)
+}
+
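+// PredictStream falls back to Predict: the full result is sent on results as a single message, then the channel is closed.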
+func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
+	go func() {
+		res, err := llm.bloomz.Predict(opts.Prompt, buildPredictOptions(opts)...)
+
+		if err != nil {
+			fmt.Println("err: ", err)
+		}
+		results <- res
+		close(results)
+	}()
+
+	return nil
+}
--- a/pkg/backend/llm/falcon/falcon.go
+++ b/pkg/backend/llm/falcon/falcon.go
@@ -0,0 +1,145 @@
+package falcon
+
+// This is a wrapper to satisfy the GRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"fmt"
+
+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
+	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+
+	ggllm "github.com/mudler/go-ggllm.cpp"
+)
+
+type LLM struct {
+	base.SingleThread
+
+	falcon *ggllm.Falcon
+}
+
+func (llm *LLM) Load(opts *pb.ModelOptions) error {
+	ggllmOpts := []ggllm.ModelOption{}
+	if opts.ContextSize != 0 {
+		ggllmOpts = append(ggllmOpts, ggllm.SetContext(int(opts.ContextSize)))
+	}
+	// F16 doesn't seem to produce good output at all!
+	//if c.F16 {
+	//	llamaOpts = append(llamaOpts, llama.EnableF16Memory)
+	//}
+
+	if opts.NGPULayers != 0 {
+		ggllmOpts = append(ggllmOpts, ggllm.SetGPULayers(int(opts.NGPULayers)))
+	}
+
+	ggllmOpts = append(ggllmOpts, ggllm.SetMMap(opts.MMap))
+	ggllmOpts = append(ggllmOpts, ggllm.SetMainGPU(opts.MainGPU))
+	ggllmOpts = append(ggllmOpts, ggllm.SetTensorSplit(opts.TensorSplit))
+	if opts.NBatch != 0 {
+		ggllmOpts = append(ggllmOpts, ggllm.SetNBatch(int(opts.NBatch)))
+	} else {
+		ggllmOpts = append(ggllmOpts, ggllm.SetNBatch(512))
+	}
+
+	model, err := ggllm.New(opts.ModelFile, ggllmOpts...)
+	llm.falcon = model
+	return err
+}
+
+func buildPredictOptions(opts *pb.PredictOptions) []ggllm.PredictOption {
+	predictOptions := []ggllm.PredictOption{
+		ggllm.SetTemperature(float64(opts.Temperature)),
+		ggllm.SetTopP(float64(opts.TopP)),
+		ggllm.SetTopK(int(opts.TopK)),
+		ggllm.SetTokens(int(opts.Tokens)),
+		ggllm.SetThreads(int(opts.Threads)),
+	}
+
+	if opts.PromptCacheAll {
+		predictOptions = append(predictOptions, ggllm.EnablePromptCacheAll)
+	}
+
+	if opts.PromptCacheRO {
+		predictOptions = append(predictOptions, ggllm.EnablePromptCacheRO)
+	}
+
+	// Expected absolute path
+	if opts.PromptCachePath != "" {
+		predictOptions = append(predictOptions, ggllm.SetPathPromptCache(opts.PromptCachePath))
+	}
+
+	if opts.Mirostat != 0 {
+		predictOptions = append(predictOptions, ggllm.SetMirostat(int(opts.Mirostat)))
+	}
+
+	if opts.MirostatETA != 0 {
+		predictOptions = append(predictOptions, ggllm.SetMirostatETA(float64(opts.MirostatETA)))
+	}
+
+	if opts.MirostatTAU != 0 {
+		predictOptions = append(predictOptions, ggllm.SetMirostatTAU(float64(opts.MirostatTAU)))
+	}
+
+	if opts.Debug {
+		predictOptions = append(predictOptions, ggllm.Debug)
+	}
+
+	predictOptions = append(predictOptions, ggllm.SetStopWords(opts.StopPrompts...))
+
+	if opts.PresencePenalty != 0 {
+		predictOptions = append(predictOptions, ggllm.SetPenalty(float64(opts.PresencePenalty)))
+	}
+
+	if opts.NKeep != 0 {
+		predictOptions = append(predictOptions, ggllm.SetNKeep(int(opts.NKeep)))
+	}
+
+	if opts.Batch != 0 {
+		predictOptions = append(predictOptions, ggllm.SetBatch(int(opts.Batch)))
+	}
+
+	if opts.IgnoreEOS {
+		predictOptions = append(predictOptions, ggllm.IgnoreEOS)
+	}
+
+	if opts.Seed != 0 {
+		predictOptions = append(predictOptions, ggllm.SetSeed(int(opts.Seed)))
+	}
+
+	//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
+
+	predictOptions = append(predictOptions, ggllm.SetFrequencyPenalty(float64(opts.FrequencyPenalty)))
+	predictOptions = append(predictOptions, ggllm.SetMlock(opts.MLock))
+	predictOptions = append(predictOptions, ggllm.SetMemoryMap(opts.MMap))
+	predictOptions = append(predictOptions, ggllm.SetPredictionMainGPU(opts.MainGPU))
+	predictOptions = append(predictOptions, ggllm.SetPredictionTensorSplit(opts.TensorSplit))
+	predictOptions = append(predictOptions, ggllm.SetTailFreeSamplingZ(float64(opts.TailFreeSamplingZ)))
+	predictOptions = append(predictOptions, ggllm.SetTypicalP(float64(opts.TypicalP)))
+	return predictOptions
+}
+
+func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
+	return llm.falcon.Predict(opts.Prompt, buildPredictOptions(opts)...)
+}
+
+func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
+
+	predictOptions := buildPredictOptions(opts)
+
+	predictOptions = append(predictOptions, ggllm.SetTokenCallback(func(token string) bool {
+		if token == "<|endoftext|>" {
+			return true
+		}
+		results <- token
+		return true
+	}))
+
+	go func() {
+		_, err := llm.falcon.Predict(opts.Prompt, predictOptions...)
+		if err != nil {
+			fmt.Println("err: ", err)
+		}
+		close(results)
+	}()
+
+	return nil
+}
--- a/pkg/gallery/gallery.go
+++ b/pkg/gallery/gallery.go
@@ -8,7 +8,6 @@ import (

 	"github.com/go-skynet/LocalAI/pkg/utils"
 	"github.com/imdario/mergo"
-	"github.com/rs/zerolog/log"
 	"gopkg.in/yaml.v2"
 )

@@ -167,9 +166,7 @@ func getGalleryModels(gallery Gallery, basePath string) ([]*GalleryModel, error)
 		return yaml.Unmarshal(d, &models)
 	})
 	if err != nil {
-		if yamlErr, ok := err.(*yaml.TypeError); ok {
-			log.Debug().Msgf("YAML errors: %s\n\nwreckage of models: %+v", strings.Join(yamlErr.Errors, "\n"), models)
-		}
+
 		return models, err
 	}

--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@@ -18,6 +18,7 @@ const (
 	LlamaBackend        = "llama"
 	LlamaStableBackend  = "llama-stable"
 	LLamaCPP            = "llama-cpp"
+	BloomzBackend       = "bloomz"
 	StarcoderBackend    = "starcoder"
 	GPTJBackend         = "gptj"
 	DollyBackend        = "dolly"
@@ -29,6 +30,7 @@ const (
 	Gpt4AllMptBackend   = "gpt4all-mpt"
 	Gpt4AllJBackend     = "gpt4all-j"
 	Gpt4All             = "gpt4all"
+	FalconBackend       = "falcon"
 	FalconGGMLBackend   = "falcon-ggml"

 	BertEmbeddingsBackend  = "bert-embeddings"
@@ -44,6 +46,7 @@ var AutoLoadBackends []string = []string{
 	LlamaStableBackend,
 	LlamaBackend,
 	Gpt4All,
+	FalconBackend,
 	GPTNeoXBackend,
 	BertEmbeddingsBackend,
 	FalconGGMLBackend,
@@ -53,6 +56,7 @@ var AutoLoadBackends []string = []string{
 	MPTBackend,
 	ReplitBackend,
 	StarcoderBackend,
+	BloomzBackend,
 	RwkvBackend,
 	WhisperBackend,
 	StableDiffusionBackend,