Compare commits

...

7 Commits

Author SHA1 Message Date
LocalAI [bot]
6ef7ea2635 ⬆️ Update ggerganov/llama.cpp (#1207)
Signed-off-by: GitHub <noreply@github.com>
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2023-10-30 08:00:36 +00:00
Ettore Di Giacinto
f8c00fbaf1 ci: enlarge download timeout window
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2023-10-29 22:09:35 +01:00
Ettore Di Giacinto
d9a42cc4c5 ci: run only cublas on selfhosted (#1224)
* ci: run only cublas on selfhosted

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* debug

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* update git

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* change testing embeddings model link

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2023-10-29 22:04:43 +01:00
Ettore Di Giacinto
fc0bc32814 ci: use self-hosted to build container images (#1206)
ci: use self-hosted

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2023-10-26 21:13:40 +02:00
Ettore Di Giacinto
c62504ac92 cleanup: drop bloomz and ggllm as now supported by llama.cpp (#1217)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2023-10-26 07:43:31 +02:00
Ettore Di Giacinto
f227e918f9 feat(llama.cpp): Bump llama.cpp, adapt grpc server (#1211)
* feat(llama.cpp): Bump llama.cpp, adapt grpc server

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* ci: fixups

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2023-10-25 20:56:25 +02:00
Ettore Di Giacinto
c132dbadce docs(examples): Add mistral example (#1214)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2023-10-25 20:56:12 +02:00
15 changed files with 182 additions and 339 deletions

View File

@@ -24,42 +24,16 @@ jobs:
tag-latest: 'auto'
tag-suffix: ''
ffmpeg: ''
- build-type: 'cublas'
cuda-major-version: 11
cuda-minor-version: 7
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda11'
ffmpeg: ''
- build-type: 'cublas'
cuda-major-version: 12
cuda-minor-version: 1
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda12'
ffmpeg: ''
- build-type: ''
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-ffmpeg'
ffmpeg: 'true'
- build-type: 'cublas'
cuda-major-version: 11
cuda-minor-version: 7
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda11-ffmpeg'
ffmpeg: 'true'
- build-type: 'cublas'
cuda-major-version: 12
cuda-minor-version: 1
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda12-ffmpeg'
ffmpeg: 'true'
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Release space from worker
run: |
echo "Listing top largest packages"
@@ -96,9 +70,6 @@ jobs:
echo
sudo rm -rfv build || true
df -h
- name: Checkout
uses: actions/checkout@v4
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
@@ -144,3 +115,99 @@ jobs:
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
docker-gpu:
strategy:
matrix:
include:
- build-type: 'cublas'
cuda-major-version: 11
cuda-minor-version: 7
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda11'
ffmpeg: ''
- build-type: 'cublas'
cuda-major-version: 12
cuda-minor-version: 1
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda12'
ffmpeg: ''
- build-type: 'cublas'
cuda-major-version: 11
cuda-minor-version: 7
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda11-ffmpeg'
ffmpeg: 'true'
- build-type: 'cublas'
cuda-major-version: 12
cuda-minor-version: 1
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda12-ffmpeg'
ffmpeg: 'true'
runs-on: arc-runner-set
steps:
- name: Force Install GIT latest
run: |
sudo apt-get update \
&& sudo apt-get install -y software-properties-common \
&& sudo apt-get update \
&& sudo add-apt-repository -y ppa:git-core/ppa \
&& sudo apt-get update \
&& sudo apt-get install -y git
- name: Checkout
uses: actions/checkout@v4
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: quay.io/go-skynet/local-ai
tags: |
type=ref,event=branch
type=semver,pattern={{raw}}
type=sha
flavor: |
latest=${{ matrix.tag-latest }}
suffix=${{ matrix.tag-suffix }}
- name: Set up QEMU
uses: docker/setup-qemu-action@master
with:
platforms: all
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@master
- name: Login to DockerHub
if: github.event_name != 'pull_request'
uses: docker/login-action@v3
with:
registry: quay.io
username: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
password: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
- name: Build and push
uses: docker/build-push-action@v5
with:
builder: ${{ steps.buildx.outputs.name }}
build-args: |
BUILD_TYPE=${{ matrix.build-type }}
CUDA_MAJOR_VERSION=${{ matrix.cuda-major-version }}
CUDA_MINOR_VERSION=${{ matrix.cuda-minor-version }}
FFMPEG=${{ matrix.ffmpeg }}
context: .
file: ./Dockerfile
platforms: ${{ matrix.platforms }}
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
- name: Release space from worker ♻
if: always()
run: |
docker system prune -f -a --volumes || true

View File

@@ -15,7 +15,7 @@ concurrency:
jobs:
ubuntu-latest:
runs-on: self-hosted
runs-on: gpu
strategy:
matrix:
go-version: ['1.21.x']
@@ -40,6 +40,8 @@ jobs:
if [ ! -e /run/systemd/system ]; then
sudo mkdir /run/systemd/system
fi
sudo mkdir -p /host/tests/${{ github.head_ref || github.ref }}
sudo chmod -R 777 /host/tests/${{ github.head_ref || github.ref }}
make \
TEST_DIR="/host/tests/${{ github.head_ref || github.ref }}" \
BUILD_TYPE=cublas \
@@ -57,4 +59,5 @@ jobs:
make \
TEST_DIR="/host/tests/${{ github.head_ref || github.ref }}" \
teardown-e2e || true
docker system prune -f -a --volumes || true
sudo rm -rf /host/tests/${{ github.head_ref || github.ref }} || true
docker system prune -f -a --volumes || true

View File

@@ -8,7 +8,7 @@ GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0
GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
CPPLLAMA_VERSION?=96981f37b1e3f450d9e63e571514217bf60f0a7f
CPPLLAMA_VERSION?=6e08281e588bbba1a5d180290a94a43f167f3a1a
# gpt4all version
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -30,15 +30,9 @@ BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d
# go-piper version
PIPER_VERSION?=56b8a81b4760a6fbee1a82e62f007ae7e8f010a7
# go-bloomz version
BLOOMZ_VERSION?=1834e77b83faafe912ad4092ccf7f77937349e2f
# stablediffusion version
STABLEDIFFUSION_VERSION?=d89260f598afb809279bc72aa0107b4292587632
# Go-ggllm
GOGGLLM_VERSION?=862477d16eefb0805261c19c9b0d053e3b2b684b
export BUILD_TYPE?=
export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
export CMAKE_ARGS?=
@@ -129,7 +123,7 @@ ifeq ($(findstring tts,$(GO_TAGS)),tts)
OPTIONAL_GRPC+=backend-assets/grpc/piper
endif
ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/llama-cpp backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/llama backend-assets/grpc/llama-cpp backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
GRPC_BACKENDS?=$(ALL_GRPC_BACKENDS) $(OPTIONAL_GRPC)
# If empty, then we build all
@@ -146,14 +140,6 @@ gpt4all:
git clone --recurse-submodules $(GPT4ALL_REPO) gpt4all
cd gpt4all && git checkout -b build $(GPT4ALL_VERSION) && git submodule update --init --recursive --depth 1
## go-ggllm
go-ggllm:
git clone --recurse-submodules https://github.com/mudler/go-ggllm.cpp go-ggllm
cd go-ggllm && git checkout -b build $(GOGGLLM_VERSION) && git submodule update --init --recursive --depth 1
go-ggllm/libggllm.a: go-ggllm
$(MAKE) -C go-ggllm BUILD_TYPE=$(BUILD_TYPE) libggllm.a
## go-piper
go-piper:
git clone --recurse-submodules https://github.com/mudler/go-piper go-piper
@@ -180,14 +166,6 @@ go-rwkv:
go-rwkv/librwkv.a: go-rwkv
cd go-rwkv && cd rwkv.cpp && cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF && cmake --build . && cp librwkv.a ..
## bloomz
bloomz:
git clone --recurse-submodules https://github.com/go-skynet/bloomz.cpp bloomz
cd bloomz && git checkout -b build $(BLOOMZ_VERSION) && git submodule update --init --recursive --depth 1
bloomz/libbloomz.a: bloomz
cd bloomz && make libbloomz.a
go-bert/libgobert.a: go-bert
$(MAKE) -C go-bert libgobert.a
@@ -241,7 +219,7 @@ go-llama-stable/libbinding.a: go-llama-stable
go-piper/libpiper_binding.a: go-piper
$(MAKE) -C go-piper libpiper_binding.a example/main
get-sources: go-llama go-llama-stable go-ggllm go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert bloomz go-stable-diffusion
get-sources: go-llama go-llama-stable go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert go-stable-diffusion
touch $@
replace:
@@ -250,10 +228,8 @@ replace:
$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/go-rwkv
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(shell pwd)/whisper.cpp
$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(shell pwd)/go-bert
$(GOCMD) mod edit -replace github.com/go-skynet/bloomz.cpp=$(shell pwd)/bloomz
$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(shell pwd)/go-stable-diffusion
$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(shell pwd)/go-piper
$(GOCMD) mod edit -replace github.com/mudler/go-ggllm.cpp=$(shell pwd)/go-ggllm
prepare-sources: get-sources replace
$(GOCMD) mod download
@@ -269,9 +245,7 @@ rebuild: ## Rebuilds the project
$(MAKE) -C whisper.cpp clean
$(MAKE) -C go-stable-diffusion clean
$(MAKE) -C go-bert clean
$(MAKE) -C bloomz clean
$(MAKE) -C go-piper clean
$(MAKE) -C go-ggllm clean
$(MAKE) build
prepare: prepare-sources $(OPTIONAL_TARGETS)
@@ -289,10 +263,8 @@ clean: ## Remove build related file
rm -rf ./backend-assets
rm -rf ./go-rwkv
rm -rf ./go-bert
rm -rf ./bloomz
rm -rf ./whisper.cpp
rm -rf ./go-piper
rm -rf ./go-ggllm
rm -rf $(BINARY_NAME)
rm -rf release/
$(MAKE) -C backend/cpp/llama clean
@@ -320,7 +292,7 @@ test-models/testmodel:
mkdir test-dir
wget https://huggingface.co/nnakasato/ggml-model-test/resolve/main/ggml-model-q4.bin -O test-models/testmodel
wget https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
wget https://huggingface.co/skeskinen/ggml/resolve/main/all-MiniLM-L6-v2/ggml-model-q4_0.bin -O test-models/bert
wget https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
wget https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
wget https://huggingface.co/mudler/rwkv-4-raven-1.5B-ggml/resolve/main/RWKV-4-Raven-1B5-v11-Eng99%2525-Other1%2525-20230425-ctx4096_Q4_0.bin -O test-models/rwkv
wget https://raw.githubusercontent.com/saharNooby/rwkv.cpp/5eb8f09c146ea8124633ab041d9ea0b1f1db4459/rwkv/20B_tokenizer.json -O test-models/rwkv.tokenizer.json
@@ -418,10 +390,6 @@ protogen-python:
backend-assets/grpc:
mkdir -p backend-assets/grpc
backend-assets/grpc/falcon: backend-assets/grpc go-ggllm/libggllm.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggllm LIBRARY_PATH=$(shell pwd)/go-ggllm \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/falcon ./cmd/grpc/falcon/
backend-assets/grpc/llama: backend-assets/grpc go-llama/libbinding.a
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama LIBRARY_PATH=$(shell pwd)/go-llama \
@@ -486,10 +454,6 @@ backend-assets/grpc/rwkv: backend-assets/grpc go-rwkv/librwkv.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-rwkv LIBRARY_PATH=$(shell pwd)/go-rwkv \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./cmd/grpc/rwkv/
backend-assets/grpc/bloomz: backend-assets/grpc bloomz/libbloomz.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/bloomz LIBRARY_PATH=$(shell pwd)/bloomz \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bloomz ./cmd/grpc/bloomz/
backend-assets/grpc/bert-embeddings: backend-assets/grpc go-bert/libgobert.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-bert LIBRARY_PATH=$(shell pwd)/go-bert \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./cmd/grpc/bert-embeddings/

View File

@@ -457,7 +457,7 @@ var _ = Describe("API test", func() {
Eventually(func() bool {
response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
return response["processed"].(bool)
}, "360s", "10s").Should(Equal(true))
}, "960s", "10s").Should(Equal(true))
resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "gpt4all-j", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: "How are you?"}}})
Expect(err).ToNot(HaveOccurred())
@@ -687,7 +687,7 @@ var _ = Describe("API test", func() {
Input: []string{"sun", "cat"},
},
)
Expect(err).ToNot(HaveOccurred())
Expect(err).ToNot(HaveOccurred(), err)
Expect(len(resp.Data[0].Embedding)).To(BeNumerically("==", 384))
Expect(len(resp.Data[1].Embedding)).To(BeNumerically("==", 384))

View File

@@ -275,11 +275,11 @@ struct llama_server_context
if (suff_rm_leading_spc && suffix_tokens[0] == space_token) {
suffix_tokens.erase(suffix_tokens.begin());
}
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx));
prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(ctx)); // always add BOS
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
prefix_tokens.push_back(llama_token_middle(ctx));
prefix_tokens.push_back(llama_token_middle(model));
auto prompt_tokens = prefix_tokens;
@@ -419,7 +419,7 @@ struct llama_server_context
if (params.n_predict == 0)
{
has_next_token = false;
result.tok = llama_token_eos(ctx);
result.tok = llama_token_eos(model);
return result;
}
@@ -453,7 +453,7 @@ struct llama_server_context
// decrement remaining sampling budget
--n_remain;
if (!embd.empty() && embd.back() == llama_token_eos(ctx))
if (!embd.empty() && embd.back() == llama_token_eos(model))
{
// stopping_word = llama_token_to_piece(ctx, embd.back());
has_next_token = false;
@@ -594,7 +594,7 @@ static void parse_options_completion(bool streaming,const backend::PredictOption
if (predict->ignoreeos())
{
llama.params.sparams.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
llama.params.sparams.logit_bias[llama_token_eos(llama.model)] = -INFINITY;
}
// const auto &logit_bias = body.find("logit_bias");
@@ -676,7 +676,7 @@ static void params_parse(const backend::ModelOptions* request,
}
static bool is_at_eob(llama_server_context &server_context, const llama_token *tokens, const size_t n_tokens) {
return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx);
return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.model);
}
// Function matching type llama_beam_search_callback_fn_t.

View File

@@ -1,23 +0,0 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
bloomz "github.com/go-skynet/LocalAI/pkg/backend/llm/bloomz"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &bloomz.LLM{}); err != nil {
panic(err)
}
}

View File

@@ -1,25 +0,0 @@
package main
// GRPC Falcon server
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
falcon "github.com/go-skynet/LocalAI/pkg/backend/llm/falcon"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &falcon.LLM{}); err != nil {
panic(err)
}
}

View File

@@ -0,0 +1,42 @@
## Advanced configuration
This section contains examples of how to install models manually with config files.
### Prerequisites
First clone LocalAI:
```bash
git clone https://github.com/go-skynet/LocalAI
cd LocalAI
```
Set up the model you prefer from the examples below, then start LocalAI:
```bash
docker compose up -d --pull always
```
If LocalAI is already running, you can restart it with:
```bash
docker compose restart
```
See also the getting started guide: https://localai.io/basics/getting_started/
### Mistral
To set up Mistral, copy the files inside `mistral` into the `models` folder:
```bash
cp -r examples/configurations/mistral/* models/
```
Now download the model:
```bash
wget https://huggingface.co/TheBloke/Mistral-7B-OpenOrca-GGUF/resolve/main/mistral-7b-openorca.Q6_K.gguf -O models/mistral-7b-openorca.Q6_K.gguf
```
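Once the config files and the model are in place and LocalAI has (re)started, you can smoke-test the setup with a chat completion request. A minimal sketch, assuming the default docker-compose setup exposing the OpenAI-compatible API on port 8080 (adjust host and port to your deployment); the model name `mistral` matches the `name:` field in the YAML config added in this change:
```bash
# Hypothetical smoke test: endpoint path follows the OpenAI-compatible API,
# host/port are assumptions based on the default docker-compose setup.
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "mistral",
    "messages": [{"role": "user", "content": "How are you?"}],
    "temperature": 0.2
  }'
```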

View File

@@ -0,0 +1,3 @@
{{.Input}}
<|im_start|>assistant

View File

@@ -0,0 +1,3 @@
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
{{if .Content}}{{.Content}}{{end}}
<|im_end|>

View File

@@ -0,0 +1 @@
{{.Input}}

View File

@@ -0,0 +1,16 @@
name: mistral
mmap: true
parameters:
model: mistral-7b-openorca.Q6_K.gguf
temperature: 0.2
top_k: 40
top_p: 0.95
template:
chat_message: chatml
chat: chatml-block
completion: completion
context_size: 4096
f16: true
stopwords:
- <|im_end|>
threads: 4

View File

@@ -1,59 +0,0 @@
package bloomz
// This is a wrapper to satisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
"fmt"
"github.com/go-skynet/LocalAI/pkg/grpc/base"
pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
"github.com/go-skynet/bloomz.cpp"
)
type LLM struct {
base.SingleThread
bloomz *bloomz.Bloomz
}
func (llm *LLM) Load(opts *pb.ModelOptions) error {
model, err := bloomz.New(opts.ModelFile)
llm.bloomz = model
return err
}
func buildPredictOptions(opts *pb.PredictOptions) []bloomz.PredictOption {
predictOptions := []bloomz.PredictOption{
bloomz.SetTemperature(float64(opts.Temperature)),
bloomz.SetTopP(float64(opts.TopP)),
bloomz.SetTopK(int(opts.TopK)),
bloomz.SetTokens(int(opts.Tokens)),
bloomz.SetThreads(int(opts.Threads)),
}
if opts.Seed != 0 {
predictOptions = append(predictOptions, bloomz.SetSeed(int(opts.Seed)))
}
return predictOptions
}
func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
return llm.bloomz.Predict(opts.Prompt, buildPredictOptions(opts)...)
}
// fallback to Predict
func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
go func() {
res, err := llm.bloomz.Predict(opts.Prompt, buildPredictOptions(opts)...)
if err != nil {
fmt.Println("err: ", err)
}
results <- res
close(results)
}()
return nil
}

View File

@@ -1,145 +0,0 @@
package falcon
// This is a wrapper to satisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
"fmt"
"github.com/go-skynet/LocalAI/pkg/grpc/base"
pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
ggllm "github.com/mudler/go-ggllm.cpp"
)
type LLM struct {
base.SingleThread
falcon *ggllm.Falcon
}
func (llm *LLM) Load(opts *pb.ModelOptions) error {
ggllmOpts := []ggllm.ModelOption{}
if opts.ContextSize != 0 {
ggllmOpts = append(ggllmOpts, ggllm.SetContext(int(opts.ContextSize)))
}
// F16 doesn't seem to produce good output at all!
//if c.F16 {
// llamaOpts = append(llamaOpts, llama.EnableF16Memory)
//}
if opts.NGPULayers != 0 {
ggllmOpts = append(ggllmOpts, ggllm.SetGPULayers(int(opts.NGPULayers)))
}
ggllmOpts = append(ggllmOpts, ggllm.SetMMap(opts.MMap))
ggllmOpts = append(ggllmOpts, ggllm.SetMainGPU(opts.MainGPU))
ggllmOpts = append(ggllmOpts, ggllm.SetTensorSplit(opts.TensorSplit))
if opts.NBatch != 0 {
ggllmOpts = append(ggllmOpts, ggllm.SetNBatch(int(opts.NBatch)))
} else {
ggllmOpts = append(ggllmOpts, ggllm.SetNBatch(512))
}
model, err := ggllm.New(opts.ModelFile, ggllmOpts...)
llm.falcon = model
return err
}
func buildPredictOptions(opts *pb.PredictOptions) []ggllm.PredictOption {
predictOptions := []ggllm.PredictOption{
ggllm.SetTemperature(float64(opts.Temperature)),
ggllm.SetTopP(float64(opts.TopP)),
ggllm.SetTopK(int(opts.TopK)),
ggllm.SetTokens(int(opts.Tokens)),
ggllm.SetThreads(int(opts.Threads)),
}
if opts.PromptCacheAll {
predictOptions = append(predictOptions, ggllm.EnablePromptCacheAll)
}
if opts.PromptCacheRO {
predictOptions = append(predictOptions, ggllm.EnablePromptCacheRO)
}
// Expected absolute path
if opts.PromptCachePath != "" {
predictOptions = append(predictOptions, ggllm.SetPathPromptCache(opts.PromptCachePath))
}
if opts.Mirostat != 0 {
predictOptions = append(predictOptions, ggllm.SetMirostat(int(opts.Mirostat)))
}
if opts.MirostatETA != 0 {
predictOptions = append(predictOptions, ggllm.SetMirostatETA(float64(opts.MirostatETA)))
}
if opts.MirostatTAU != 0 {
predictOptions = append(predictOptions, ggllm.SetMirostatTAU(float64(opts.MirostatTAU)))
}
if opts.Debug {
predictOptions = append(predictOptions, ggllm.Debug)
}
predictOptions = append(predictOptions, ggllm.SetStopWords(opts.StopPrompts...))
if opts.PresencePenalty != 0 {
predictOptions = append(predictOptions, ggllm.SetPenalty(float64(opts.PresencePenalty)))
}
if opts.NKeep != 0 {
predictOptions = append(predictOptions, ggllm.SetNKeep(int(opts.NKeep)))
}
if opts.Batch != 0 {
predictOptions = append(predictOptions, ggllm.SetBatch(int(opts.Batch)))
}
if opts.IgnoreEOS {
predictOptions = append(predictOptions, ggllm.IgnoreEOS)
}
if opts.Seed != 0 {
predictOptions = append(predictOptions, ggllm.SetSeed(int(opts.Seed)))
}
//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
predictOptions = append(predictOptions, ggllm.SetFrequencyPenalty(float64(opts.FrequencyPenalty)))
predictOptions = append(predictOptions, ggllm.SetMlock(opts.MLock))
predictOptions = append(predictOptions, ggllm.SetMemoryMap(opts.MMap))
predictOptions = append(predictOptions, ggllm.SetPredictionMainGPU(opts.MainGPU))
predictOptions = append(predictOptions, ggllm.SetPredictionTensorSplit(opts.TensorSplit))
predictOptions = append(predictOptions, ggllm.SetTailFreeSamplingZ(float64(opts.TailFreeSamplingZ)))
predictOptions = append(predictOptions, ggllm.SetTypicalP(float64(opts.TypicalP)))
return predictOptions
}
func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
return llm.falcon.Predict(opts.Prompt, buildPredictOptions(opts)...)
}
func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
predictOptions := buildPredictOptions(opts)
predictOptions = append(predictOptions, ggllm.SetTokenCallback(func(token string) bool {
if token == "<|endoftext|>" {
return true
}
results <- token
return true
}))
go func() {
_, err := llm.falcon.Predict(opts.Prompt, predictOptions...)
if err != nil {
fmt.Println("err: ", err)
}
close(results)
}()
return nil
}

View File

@@ -18,7 +18,6 @@ const (
LlamaBackend = "llama"
LlamaStableBackend = "llama-stable"
LLamaCPP = "llama-cpp"
BloomzBackend = "bloomz"
StarcoderBackend = "starcoder"
GPTJBackend = "gptj"
DollyBackend = "dolly"
@@ -30,7 +29,6 @@ const (
Gpt4AllMptBackend = "gpt4all-mpt"
Gpt4AllJBackend = "gpt4all-j"
Gpt4All = "gpt4all"
FalconBackend = "falcon"
FalconGGMLBackend = "falcon-ggml"
BertEmbeddingsBackend = "bert-embeddings"
@@ -46,7 +44,6 @@ var AutoLoadBackends []string = []string{
LlamaStableBackend,
LlamaBackend,
Gpt4All,
FalconBackend,
GPTNeoXBackend,
BertEmbeddingsBackend,
FalconGGMLBackend,
@@ -56,7 +53,6 @@ var AutoLoadBackends []string = []string{
MPTBackend,
ReplitBackend,
StarcoderBackend,
BloomzBackend,
RwkvBackend,
WhisperBackend,
StableDiffusionBackend,