fix(llama.cpp): fix eos without cache

fix(whisper.cpp): Add stubs and -lcuda
deps(whisper.cpp): update, fix cublas build
2026-05-21 23:26:26 -04:00 · 2024-03-18 12:14:16 +01:00 · 2024-03-18 12:13:39 +01:00 · 2024-03-16 10:38:57 +01:00 · 2024-03-15 23:51:03 +00:00 · 2024-03-15 18:14:23 +01:00
6 changed files with 60 additions and 37 deletions
--- a/18
+++ b/18
@@ -4,11 +4,11 @@ GOVET=$(GOCMD) vet
 BINARY_NAME=local-ai
 # llama.cpp versions
-GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0
+GOLLAMA_VERSION?=6a8041ef6b46d4712afc3ae791d1c2d73da0ad1c
 GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
-CPPLLAMA_VERSION?=19885d205e768579ab090d1e99281cae58c21b54
+CPPLLAMA_VERSION?=4755afd1cbd40d93c017e5b98c39796f52345314
 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -19,13 +19,13 @@ RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
 RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6
 # whisper.cpp version
-WHISPER_CPP_VERSION?=37a709f6558c6d9783199e2b8cbb136e1c41d346
+WHISPER_CPP_VERSION?=a56f435fd475afd7edf02bfbf9f8c77f527198c2
 # bert.cpp version
 BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d
 # go-piper version
-PIPER_VERSION?=d6b6275ba037dabdba4a8b65dfdf6b2a73a67f07
+PIPER_VERSION?=9d0100873a7dbb0824dfea40e8cec70a1b110759
 # stablediffusion version
 STABLEDIFFUSION_VERSION?=362df9da29f882dbf09ade61972d16a1f53c3485
@@ -91,10 +91,13 @@ ifeq ($(BUILD_TYPE),openblas)
 	export WHISPER_OPENBLAS=1
 endif
 ifeq ($(BUILD_TYPE),cublas)
-	CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
+	CGO_LDFLAGS+=-lcublas -lcudart -lculibos -lcublasLt -L$(CUDA_LIBPATH)
 	export LLAMA_CUBLAS=1
 # required by whisper.cpp
 	export WHISPER_CUBLAS=1
 	CGO_LDFLAGS+=-L$(CUDA_PATH)/stubs -lcuda
 endif
 ifeq ($(BUILD_TYPE),hipblas)
@@ -462,9 +465,6 @@ backend-assets/grpc/llama: backend-assets/grpc sources/go-llama/libbinding.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama LIBRARY_PATH=$(CURDIR)/sources/go-llama \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./backend/go/llm/llama/
 # TODO: every binary should have its own folder instead, so can have different  implementations
 ifeq ($(BUILD_TYPE),metal)
 	cp backend/cpp/llama/llama.cpp/ggml-metal.metal backend-assets/grpc/
 endif
 ## BACKEND CPP LLAMA START
 # Sets the variables in case it has to build the gRPC locally.
@@ -494,7 +494,7 @@ backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/grpc-server
 	cp -rfv backend/cpp/llama/grpc-server backend-assets/grpc/llama-cpp
 # TODO: every binary should have its own folder instead, so can have different metal implementations
 ifeq ($(BUILD_TYPE),metal)
-	cp backend/cpp/llama/llama.cpp/build/bin/ggml-metal.metal backend-assets/grpc/
+	cp backend/cpp/llama/llama.cpp/build/bin/default.metallib backend-assets/grpc/
 endif
 backend-assets/grpc/llama-ggml: backend-assets/grpc sources/go-llama-ggml/libbinding.a
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -18,6 +18,7 @@ else ifeq ($(BUILD_TYPE),clblas)
 # If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ 
 else ifeq ($(BUILD_TYPE),hipblas)
 	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON
 # If it's OSX, DO NOT embed the metal library - -DLLAMA_METAL_EMBED_LIBRARY=ON requires further investigation
 endif
 ifeq ($(BUILD_TYPE),sycl_f16)
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -1084,7 +1084,7 @@ struct llama_server_context
            slot.has_next_token = false;
        }
-        if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model))
+        if (result.tok == llama_token_eos(model))
        {
            slot.stopped_eos = true;
            slot.has_next_token = false;
--- a/backend/python/common-env/transformers/transformers-nvidia.yml
+++ b/backend/python/common-env/transformers/transformers-nvidia.yml
@@ -30,6 +30,7 @@ dependencies:
      - async-timeout==4.0.3
      - attrs==23.1.0
      - bark==0.1.5
      - bitsandbytes==0.43.0
      - boto3==1.28.61
      - botocore==1.31.61
      - certifi==2023.7.22
--- a/backend/python/transformers/transformers_server.py
+++ b/backend/python/transformers/transformers_server.py
@@ -23,7 +23,7 @@ if XPU:
    from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM
    from transformers import AutoTokenizer, AutoModel, set_seed
 else:
-    from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed
+    from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed, BitsAndBytesConfig
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
@@ -75,18 +75,50 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            A Result object that contains the result of the LoadModel operation.
        """
        model_name = request.Model
        compute = "auto"
        if request.F16Memory == True:
            compute=torch.bfloat16
        self.CUDA = request.CUDA
        device_map="cpu"
        quantization = None
        if self.CUDA:
            if request.Device:
                device_map=request.Device
            else:
                device_map="cuda:0"
            if request.Quantization == "bnb_4bit":
                quantization = BitsAndBytesConfig(
                    load_in_4bit = True,
                    bnb_4bit_compute_dtype = compute,
                    bnb_4bit_quant_type = "nf4",
                    bnb_4bit_use_double_quant = True,
                    load_in_8bit = False,
                )
            elif request.Quantization == "bnb_8bit":
                quantization = BitsAndBytesConfig(
                    load_in_4bit=False,
                    bnb_4bit_compute_dtype = None,
                    load_in_8bit=True,                                   
                )
        try:
            if request.Type == "AutoModelForCausalLM":
                if XPU:
                    if quantization == "xpu_4bit":
                        xpu_4bit = True
                    self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode,
-                                              device_map="xpu", load_in_4bit=True)
+                                              device_map="xpu", load_in_4bit=xpu_4bit)
                else:
-                    self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode)
+                    self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode, use_safetensors=True, quantization_config=quantization, device_map=device_map, torch_dtype=compute)
            else:
-                self.model = AutoModel.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode)
+                self.model = AutoModel.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode,  use_safetensors=True,  quantization_config=quantization, device_map=device_map, torch_dtype=compute)
-
+            self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_safetensors=True)
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.CUDA = False
            self.XPU = False
            if XPU:
@@ -97,13 +129,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                except Exception as err:
                    print("Not using XPU:", err, file=sys.stderr)
            if request.CUDA or torch.cuda.is_available():
                try:
                    print("Loading model", model_name, "to CUDA.", file=sys.stderr)
                    self.model = self.model.to("cuda")
                    self.CUDA = True
                except Exception as err:
                    print("Not using CUDA:", err, file=sys.stderr)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        # Implement your logic here for the LoadModel service
@@ -130,13 +155,17 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        encoded_input = self.tokenizer(request.Embeddings, padding=True, truncation=True, max_length=max_length, return_tensors="pt")    
        # Create word embeddings
-        model_output = self.model(**encoded_input)
+        if self.CUDA:
            encoded_input = encoded_input.to("cuda")
        with torch.no_grad():    
            model_output = self.model(**encoded_input)
        # Pool to get sentence embeddings; i.e. generate one 1024 vector for the entire sentence
-        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']).detach().numpy()
+        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
        print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr)
        print("Embeddings:", sentence_embeddings, file=sys.stderr)
-        return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings)
+        return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings[0])
    def Predict(self, request, context):
        """
@@ -163,12 +192,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        if XPU:
            inputs = inputs.to("xpu")
-        outputs = self.model.generate(inputs,max_new_tokens=max_tokens, temperature=request.Temperature, top_p=request.TopP)
+        outputs = self.model.generate(inputs,max_new_tokens=max_tokens, temperature=request.Temperature, top_p=request.TopP, do_sample=True, pad_token_id=self.tokenizer.eos_token_id)
-
+        generated_text = self.tokenizer.batch_decode(outputs[:, inputs.shape[1]:], skip_special_tokens=True)[0]
        generated_text = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        # Remove prompt from response if present
        if request.Prompt in generated_text:
            generated_text = generated_text.replace(request.Prompt, "")
        return backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
--- a/core/backend/embeddings.go
+++ b/core/backend/embeddings.go
@@ -10,10 +10,6 @@ import (
 )
 func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) {
 	if !backendConfig.Embeddings {
 		return nil, fmt.Errorf("endpoint disabled for this model by API configuration")
 	}
 	modelFile := backendConfig.Model
 	grpcOpts := gRPCModelOpts(backendConfig)
Author	SHA1	Message	Date
Ettore Di Giacinto	495191a54a	fix(llama.cpp): fix eos without cache	2024-03-18 12:14:16 +01:00
Ettore Di Giacinto	b790fca180	fix(whisper.cpp): Add stubs and -lcuda	2024-03-18 12:13:39 +01:00
Ettore Di Giacinto	0663f66205	deps(whisper.cpp): update, fix cublas build	2024-03-16 10:38:57 +01:00
LocalAI [bot]	5826fb8e6d	⬆️ Update mudler/go-piper (#1844 ) Signed-off-by: GitHub <noreply@github.com> Co-authored-by: mudler <mudler@users.noreply.github.com>	2024-03-15 23:51:03 +00:00
Ettore Di Giacinto	89351f1a7d	feat(embeddings): do not require to be configured (#1842 ) Certain engines require knowing during model loading whether the embedding feature has to be enabled; however, it is impractical to have to set it for ALL the backends that support embeddings. There are transformers and sentencetransformers that seamlessly handle both cases, without this setting having to be explicitly enabled. The case persists only for ggml-based models that need to enable feature sets during model loading (and thus setting `embedding` is required); however, most of the other engines do not require this. This change disables the check done at code side, making it easier to use embeddings by not having to specify `embeddings: true` explicitly. Part of: https://github.com/mudler/LocalAI/issues/1373	2024-03-15 18:14:23 +01:00
Ettore Di Giacinto	ae2e4fc2fe	docs(transformers): add docs section about transformers (#1841 )	2024-03-15 18:13:30 +01:00
Dave	db199f61da	fix: osx build default.metallib (#1837 ) fix: osx build default.metallib (#1837) * port osx fix from refactor pr to slim pr * manually bump llama.cpp version to unstick CI?	2024-03-15 08:18:58 +00:00
LocalAI [bot]	44adbd2c75	⬆️ Update go-skynet/go-llama.cpp (#1835 ) Signed-off-by: GitHub <noreply@github.com> Co-authored-by: mudler <mudler@users.noreply.github.com>	2024-03-14 23:06:42 +00:00
Ettore Di Giacinto	20136ca8b7	feat(tts): add Elevenlabs and OpenAI TTS compatibility layer (#1834 ) * feat(elevenlabs): map elevenlabs API support to TTS This allows elevenlabs Clients to work automatically with LocalAI by supporting the elevenlabs API. The elevenlabs server endpoint is implemented such that it is wired to the TTS endpoints. Fixes: https://github.com/mudler/LocalAI/issues/1809 * feat(openai/tts): compat layer with openai tts Fixes: #1276 * fix: adapt tts CLI	2024-03-14 23:08:34 +01:00
Dave	45d520f913	fix: OSX Build Files for llama.cpp (#1836 ) bot ate my changes, separate branch	2024-03-14 23:07:47 +01:00
fakezeta	3882130911	feat: Add Bitsandbytes quantization for transformer backend enhancement #1775 and fix: Transformer backend error on CUDA #1774 (#1823 ) * fixes #1775 and #1774 Add BitsAndBytes Quantization and fixes embedding on CUDA devices * Manage 4bit and 8 bit quantization Manage different BitsAndBytes options with the quantization: parameter in yaml * fix compilation errors on non CUDA environment	2024-03-14 23:06:30 +01:00