docs(transformers): add docs section about transformers

fix: adapt tts CLI
feat(openai/tts): compat layer with openai tts
2026-02-03 11:13:31 -05:00 · 2024-03-15 18:02:15 +01:00 · 2024-03-14 19:24:50 +01:00 · 2024-03-14 18:15:28 +01:00 · 2024-03-14 18:12:47 +01:00
6 changed files with 37 additions and 60 deletions
--- a/18
+++ b/18
@@ -4,11 +4,11 @@ GOVET=$(GOCMD) vet
 BINARY_NAME=local-ai

 # llama.cpp versions
-GOLLAMA_VERSION?=6a8041ef6b46d4712afc3ae791d1c2d73da0ad1c
+GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0

 GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7

-CPPLLAMA_VERSION?=4755afd1cbd40d93c017e5b98c39796f52345314
+CPPLLAMA_VERSION?=19885d205e768579ab090d1e99281cae58c21b54

 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -19,13 +19,13 @@ RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
 RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6

 # whisper.cpp version
-WHISPER_CPP_VERSION?=a56f435fd475afd7edf02bfbf9f8c77f527198c2
+WHISPER_CPP_VERSION?=37a709f6558c6d9783199e2b8cbb136e1c41d346

 # bert.cpp version
 BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d

 # go-piper version
-PIPER_VERSION?=9d0100873a7dbb0824dfea40e8cec70a1b110759
+PIPER_VERSION?=d6b6275ba037dabdba4a8b65dfdf6b2a73a67f07

 # stablediffusion version
 STABLEDIFFUSION_VERSION?=362df9da29f882dbf09ade61972d16a1f53c3485
@@ -91,13 +91,10 @@ ifeq ($(BUILD_TYPE),openblas)
 	export WHISPER_OPENBLAS=1
 endif

-
 ifeq ($(BUILD_TYPE),cublas)
-	CGO_LDFLAGS+=-lcublas -lcudart -lculibos -lcublasLt -L$(CUDA_LIBPATH)
+	CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
 	export LLAMA_CUBLAS=1
-# required by whisper.cpp
 	export WHISPER_CUBLAS=1
-	CGO_LDFLAGS+=-L$(CUDA_PATH)/stubs -lcuda
 endif

 ifeq ($(BUILD_TYPE),hipblas)
@@ -465,6 +462,9 @@ backend-assets/grpc/llama: backend-assets/grpc sources/go-llama/libbinding.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama LIBRARY_PATH=$(CURDIR)/sources/go-llama \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./backend/go/llm/llama/
 # TODO: every binary should have its own folder instead, so can have different  implementations
+ifeq ($(BUILD_TYPE),metal)
+	cp backend/cpp/llama/llama.cpp/ggml-metal.metal backend-assets/grpc/
+endif

 ## BACKEND CPP LLAMA START
 # Sets the variables in case it has to build the gRPC locally.
@@ -494,7 +494,7 @@ backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/grpc-server
 	cp -rfv backend/cpp/llama/grpc-server backend-assets/grpc/llama-cpp
 # TODO: every binary should have its own folder instead, so can have different metal implementations
 ifeq ($(BUILD_TYPE),metal)
-	cp backend/cpp/llama/llama.cpp/build/bin/default.metallib backend-assets/grpc/
+	cp backend/cpp/llama/llama.cpp/build/bin/ggml-metal.metal backend-assets/grpc/
 endif

 backend-assets/grpc/llama-ggml: backend-assets/grpc sources/go-llama-ggml/libbinding.a
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -18,7 +18,6 @@ else ifeq ($(BUILD_TYPE),clblas)
 # If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ 
 else ifeq ($(BUILD_TYPE),hipblas)
 	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON
-# If it's OSX, DO NOT embed the metal library - -DLLAMA_METAL_EMBED_LIBRARY=ON requires further investigation
 endif

 ifeq ($(BUILD_TYPE),sycl_f16)
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -1084,7 +1084,7 @@ struct llama_server_context
            slot.has_next_token = false;
        }

-        if (result.tok == llama_token_eos(model))
+        if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model))
        {
            slot.stopped_eos = true;
            slot.has_next_token = false;
--- a/backend/python/common-env/transformers/transformers-nvidia.yml
+++ b/backend/python/common-env/transformers/transformers-nvidia.yml
@@ -30,7 +30,6 @@ dependencies:
      - async-timeout==4.0.3
      - attrs==23.1.0
      - bark==0.1.5
-      - bitsandbytes==0.43.0
      - boto3==1.28.61
      - botocore==1.31.61
      - certifi==2023.7.22
--- a/backend/python/transformers/transformers_server.py
+++ b/backend/python/transformers/transformers_server.py
@@ -23,7 +23,7 @@ if XPU:
    from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM
    from transformers import AutoTokenizer, AutoModel, set_seed
 else:
-    from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed, BitsAndBytesConfig
+    from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed


 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
@@ -75,50 +75,18 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            A Result object that contains the result of the LoadModel operation.
        """
        model_name = request.Model
-
-        compute = "auto"
-        if request.F16Memory == True:
-            compute=torch.bfloat16
-
-        self.CUDA = request.CUDA
-
-        device_map="cpu"
-
-        quantization = None
-
-        if self.CUDA:
-            if request.Device:
-                device_map=request.Device
-            else:
-                device_map="cuda:0"
-            if request.Quantization == "bnb_4bit":
-                quantization = BitsAndBytesConfig(
-                    load_in_4bit = True,
-                    bnb_4bit_compute_dtype = compute,
-                    bnb_4bit_quant_type = "nf4",
-                    bnb_4bit_use_double_quant = True,
-                    load_in_8bit = False,
-                )
-            elif request.Quantization == "bnb_8bit":
-                quantization = BitsAndBytesConfig(
-                    load_in_4bit=False,
-                    bnb_4bit_compute_dtype = None,
-                    load_in_8bit=True,                                   
-                )
-                                                   
-    
        try:
            if request.Type == "AutoModelForCausalLM":
                if XPU:
-                    if quantization == "xpu_4bit":
-                        xpu_4bit = True
                    self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode,
-                                              device_map="xpu", load_in_4bit=xpu_4bit)
+                                              device_map="xpu", load_in_4bit=True)
                else:
-                    self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode, use_safetensors=True, quantization_config=quantization, device_map=device_map, torch_dtype=compute)
+                    self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode)
            else:
-                self.model = AutoModel.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode,  use_safetensors=True,  quantization_config=quantization, device_map=device_map, torch_dtype=compute)
-            self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_safetensors=True)
+                self.model = AutoModel.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode)
+
+            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+            self.CUDA = False
            self.XPU = False

            if XPU:
@@ -129,6 +97,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                except Exception as err:
                    print("Not using XPU:", err, file=sys.stderr)

+            if request.CUDA or torch.cuda.is_available():
+                try:
+                    print("Loading model", model_name, "to CUDA.", file=sys.stderr)
+                    self.model = self.model.to("cuda")
+                    self.CUDA = True
+                except Exception as err:
+                    print("Not using CUDA:", err, file=sys.stderr)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        # Implement your logic here for the LoadModel service
@@ -155,17 +130,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        encoded_input = self.tokenizer(request.Embeddings, padding=True, truncation=True, max_length=max_length, return_tensors="pt")    

        # Create word embeddings
-        if self.CUDA:
-            encoded_input = encoded_input.to("cuda")
-
-        with torch.no_grad():    
-            model_output = self.model(**encoded_input)
+        model_output = self.model(**encoded_input)

        # Pool to get sentence embeddings; i.e. generate one 1024 vector for the entire sentence
-        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']).detach().numpy()
        print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr)
        print("Embeddings:", sentence_embeddings, file=sys.stderr)
-        return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings[0])
+        return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings)

    def Predict(self, request, context):
        """
@@ -192,8 +163,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        if XPU:
            inputs = inputs.to("xpu")

-        outputs = self.model.generate(inputs,max_new_tokens=max_tokens, temperature=request.Temperature, top_p=request.TopP, do_sample=True, pad_token_id=self.tokenizer.eos_token_id)
-        generated_text = self.tokenizer.batch_decode(outputs[:, inputs.shape[1]:], skip_special_tokens=True)[0]
+        outputs = self.model.generate(inputs,max_new_tokens=max_tokens, temperature=request.Temperature, top_p=request.TopP)
+
+        generated_text = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
+        # Remove prompt from response if present
+        if request.Prompt in generated_text:
+            generated_text = generated_text.replace(request.Prompt, "")

        return backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))

--- a/core/backend/embeddings.go
+++ b/core/backend/embeddings.go
@@ -10,6 +10,10 @@ import (
 )

 func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) {
+	if !backendConfig.Embeddings {
+		return nil, fmt.Errorf("endpoint disabled for this model by API configuration")
+	}
+
 	modelFile := backendConfig.Model

 	grpcOpts := gRPCModelOpts(backendConfig)
Author	SHA1	Message	Date
Ettore Di Giacinto	5b8d6a31e2	docs(transformers): add docs section about transformers	2024-03-15 18:02:15 +01:00
Ettore Di Giacinto	f0752be4aa	fix: adapt tts CLI	2024-03-14 19:24:50 +01:00
Ettore Di Giacinto	bafc9effad	feat(openai/tts): compat layer with openai tts Fixes: #1276	2024-03-14 18:15:28 +01:00
Ettore Di Giacinto	d2934dd69f	feat(elevenlabs): map elevenlabs API support to TTS This allows elevenlabs clients to work automatically with LocalAI by supporting the elevenlabs API. The elevenlabs server endpoint is implemented such that it is wired to the TTS endpoints. Fixes: https://github.com/mudler/LocalAI/issues/1809	2024-03-14 18:12:47 +01:00