Compare commits

..

11 Commits

Author SHA1 Message Date
Ettore Di Giacinto
495191a54a fix(llama.cpp): fix eos without cache 2024-03-18 12:14:16 +01:00
Ettore Di Giacinto
b790fca180 fix(whisper.cpp): Add stubs and -lcuda 2024-03-18 12:13:39 +01:00
Ettore Di Giacinto
0663f66205 deps(whisper.cpp): update, fix cublas build 2024-03-16 10:38:57 +01:00
LocalAI [bot]
5826fb8e6d ⬆️ Update mudler/go-piper (#1844)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-15 23:51:03 +00:00
Ettore Di Giacinto
89351f1a7d feat(embeddings): do not require to be configured (#1842)
Certain engines requires to know during model loading
if the embedding feature has to be enabled, however, it is impractical
to have to set it to ALL the backends that supports embeddings.

There are transformers and sentencentransformers that seamelessly handle
both cases, without having this settings to be explicitly enabled.

The case sussist only for ggml-based models that needs to enable
featuresets during model loading (and thus settings `embedding` is
required), however most of the other engines does not require this.

This change disables the check done at code side, making easier to use
embeddings by not having to specify explicitly `embeddings: true`.

Part of: https://github.com/mudler/LocalAI/issues/1373
2024-03-15 18:14:23 +01:00
Ettore Di Giacinto
ae2e4fc2fe docs(transformers): add docs section about transformers (#1841) 2024-03-15 18:13:30 +01:00
Dave
db199f61da fix: osx build default.metallib (#1837)
fix: osx build default.metallib (#1837)
* port osx fix from refactor pr to slim pr
* manually bump llama.cpp version to unstick CI?
2024-03-15 08:18:58 +00:00
LocalAI [bot]
44adbd2c75 ⬆️ Update go-skynet/go-llama.cpp (#1835)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-14 23:06:42 +00:00
Ettore Di Giacinto
20136ca8b7 feat(tts): add Elevenlabs and OpenAI TTS compatibility layer (#1834)
* feat(elevenlabs): map elevenlabs API support to TTS

This allows elevenlabs Clients to work automatically with LocalAI by
supporting the elevenlabs API.

The elevenlabs server endpoint is implemented such as it is wired to the
TTS endpoints.

Fixes: https://github.com/mudler/LocalAI/issues/1809

* feat(openai/tts): compat layer with openai tts

Fixes: #1276

* fix: adapt tts CLI
2024-03-14 23:08:34 +01:00
Dave
45d520f913 fix: OSX Build Files for llama.cpp (#1836)
bot ate my changes, seperate branch
2024-03-14 23:07:47 +01:00
fakezeta
3882130911 feat: Add Bitsandbytes quantization for transformer backend enhancement #1775 and fix: Transformer backend error on CUDA #1774 (#1823)
* fixes #1775 and #1774

Add BitsAndBytes Quantization and fixes embedding on CUDA devices

* Manage 4bit and 8 bit quantization

Manage different BitsAndBytes options with the quantization: parameter in yaml

* fix compilation errors on non CUDA environment
2024-03-14 23:06:30 +01:00
6 changed files with 60 additions and 37 deletions

View File

@@ -4,11 +4,11 @@ GOVET=$(GOCMD) vet
BINARY_NAME=local-ai BINARY_NAME=local-ai
# llama.cpp versions # llama.cpp versions
GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0 GOLLAMA_VERSION?=6a8041ef6b46d4712afc3ae791d1c2d73da0ad1c
GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7 GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
CPPLLAMA_VERSION?=19885d205e768579ab090d1e99281cae58c21b54 CPPLLAMA_VERSION?=4755afd1cbd40d93c017e5b98c39796f52345314
# gpt4all version # gpt4all version
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -19,13 +19,13 @@ RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6 RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6
# whisper.cpp version # whisper.cpp version
WHISPER_CPP_VERSION?=37a709f6558c6d9783199e2b8cbb136e1c41d346 WHISPER_CPP_VERSION?=a56f435fd475afd7edf02bfbf9f8c77f527198c2
# bert.cpp version # bert.cpp version
BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d
# go-piper version # go-piper version
PIPER_VERSION?=d6b6275ba037dabdba4a8b65dfdf6b2a73a67f07 PIPER_VERSION?=9d0100873a7dbb0824dfea40e8cec70a1b110759
# stablediffusion version # stablediffusion version
STABLEDIFFUSION_VERSION?=362df9da29f882dbf09ade61972d16a1f53c3485 STABLEDIFFUSION_VERSION?=362df9da29f882dbf09ade61972d16a1f53c3485
@@ -91,10 +91,13 @@ ifeq ($(BUILD_TYPE),openblas)
export WHISPER_OPENBLAS=1 export WHISPER_OPENBLAS=1
endif endif
ifeq ($(BUILD_TYPE),cublas) ifeq ($(BUILD_TYPE),cublas)
CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH) CGO_LDFLAGS+=-lcublas -lcudart -lculibos -lcublasLt -L$(CUDA_LIBPATH)
export LLAMA_CUBLAS=1 export LLAMA_CUBLAS=1
# required by whisper.cpp
export WHISPER_CUBLAS=1 export WHISPER_CUBLAS=1
CGO_LDFLAGS+=-L$(CUDA_PATH)/stubs -lcuda
endif endif
ifeq ($(BUILD_TYPE),hipblas) ifeq ($(BUILD_TYPE),hipblas)
@@ -462,9 +465,6 @@ backend-assets/grpc/llama: backend-assets/grpc sources/go-llama/libbinding.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama LIBRARY_PATH=$(CURDIR)/sources/go-llama \ CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama LIBRARY_PATH=$(CURDIR)/sources/go-llama \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./backend/go/llm/llama/ $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./backend/go/llm/llama/
# TODO: every binary should have its own folder instead, so can have different implementations # TODO: every binary should have its own folder instead, so can have different implementations
ifeq ($(BUILD_TYPE),metal)
cp backend/cpp/llama/llama.cpp/ggml-metal.metal backend-assets/grpc/
endif
## BACKEND CPP LLAMA START ## BACKEND CPP LLAMA START
# Sets the variables in case it has to build the gRPC locally. # Sets the variables in case it has to build the gRPC locally.
@@ -494,7 +494,7 @@ backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/grpc-server
cp -rfv backend/cpp/llama/grpc-server backend-assets/grpc/llama-cpp cp -rfv backend/cpp/llama/grpc-server backend-assets/grpc/llama-cpp
# TODO: every binary should have its own folder instead, so can have different metal implementations # TODO: every binary should have its own folder instead, so can have different metal implementations
ifeq ($(BUILD_TYPE),metal) ifeq ($(BUILD_TYPE),metal)
cp backend/cpp/llama/llama.cpp/build/bin/ggml-metal.metal backend-assets/grpc/ cp backend/cpp/llama/llama.cpp/build/bin/default.metallib backend-assets/grpc/
endif endif
backend-assets/grpc/llama-ggml: backend-assets/grpc sources/go-llama-ggml/libbinding.a backend-assets/grpc/llama-ggml: backend-assets/grpc sources/go-llama-ggml/libbinding.a

View File

@@ -18,6 +18,7 @@ else ifeq ($(BUILD_TYPE),clblas)
# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ # If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
else ifeq ($(BUILD_TYPE),hipblas) else ifeq ($(BUILD_TYPE),hipblas)
CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON
# If it's OSX, DO NOT embed the metal library - -DLLAMA_METAL_EMBED_LIBRARY=ON requires further investigation
endif endif
ifeq ($(BUILD_TYPE),sycl_f16) ifeq ($(BUILD_TYPE),sycl_f16)

View File

@@ -1084,7 +1084,7 @@ struct llama_server_context
slot.has_next_token = false; slot.has_next_token = false;
} }
if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model)) if (result.tok == llama_token_eos(model))
{ {
slot.stopped_eos = true; slot.stopped_eos = true;
slot.has_next_token = false; slot.has_next_token = false;

View File

@@ -30,6 +30,7 @@ dependencies:
- async-timeout==4.0.3 - async-timeout==4.0.3
- attrs==23.1.0 - attrs==23.1.0
- bark==0.1.5 - bark==0.1.5
- bitsandbytes==0.43.0
- boto3==1.28.61 - boto3==1.28.61
- botocore==1.31.61 - botocore==1.31.61
- certifi==2023.7.22 - certifi==2023.7.22

View File

@@ -23,7 +23,7 @@ if XPU:
from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM
from transformers import AutoTokenizer, AutoModel, set_seed from transformers import AutoTokenizer, AutoModel, set_seed
else: else:
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed, BitsAndBytesConfig
_ONE_DAY_IN_SECONDS = 60 * 60 * 24 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
@@ -75,18 +75,50 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
A Result object that contains the result of the LoadModel operation. A Result object that contains the result of the LoadModel operation.
""" """
model_name = request.Model model_name = request.Model
compute = "auto"
if request.F16Memory == True:
compute=torch.bfloat16
self.CUDA = request.CUDA
device_map="cpu"
quantization = None
if self.CUDA:
if request.Device:
device_map=request.Device
else:
device_map="cuda:0"
if request.Quantization == "bnb_4bit":
quantization = BitsAndBytesConfig(
load_in_4bit = True,
bnb_4bit_compute_dtype = compute,
bnb_4bit_quant_type = "nf4",
bnb_4bit_use_double_quant = True,
load_in_8bit = False,
)
elif request.Quantization == "bnb_8bit":
quantization = BitsAndBytesConfig(
load_in_4bit=False,
bnb_4bit_compute_dtype = None,
load_in_8bit=True,
)
try: try:
if request.Type == "AutoModelForCausalLM": if request.Type == "AutoModelForCausalLM":
if XPU: if XPU:
if quantization == "xpu_4bit":
xpu_4bit = True
self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode, self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode,
device_map="xpu", load_in_4bit=True) device_map="xpu", load_in_4bit=xpu_4bit)
else: else:
self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode) self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode, use_safetensors=True, quantization_config=quantization, device_map=device_map, torch_dtype=compute)
else: else:
self.model = AutoModel.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode) self.model = AutoModel.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode, use_safetensors=True, quantization_config=quantization, device_map=device_map, torch_dtype=compute)
self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_safetensors=True)
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
self.CUDA = False
self.XPU = False self.XPU = False
if XPU: if XPU:
@@ -97,13 +129,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
except Exception as err: except Exception as err:
print("Not using XPU:", err, file=sys.stderr) print("Not using XPU:", err, file=sys.stderr)
if request.CUDA or torch.cuda.is_available():
try:
print("Loading model", model_name, "to CUDA.", file=sys.stderr)
self.model = self.model.to("cuda")
self.CUDA = True
except Exception as err:
print("Not using CUDA:", err, file=sys.stderr)
except Exception as err: except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
# Implement your logic here for the LoadModel service # Implement your logic here for the LoadModel service
@@ -130,13 +155,17 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
encoded_input = self.tokenizer(request.Embeddings, padding=True, truncation=True, max_length=max_length, return_tensors="pt") encoded_input = self.tokenizer(request.Embeddings, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
# Create word embeddings # Create word embeddings
model_output = self.model(**encoded_input) if self.CUDA:
encoded_input = encoded_input.to("cuda")
with torch.no_grad():
model_output = self.model(**encoded_input)
# Pool to get sentence embeddings; i.e. generate one 1024 vector for the entire sentence # Pool to get sentence embeddings; i.e. generate one 1024 vector for the entire sentence
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']).detach().numpy() sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr) print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr)
print("Embeddings:", sentence_embeddings, file=sys.stderr) print("Embeddings:", sentence_embeddings, file=sys.stderr)
return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings) return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings[0])
def Predict(self, request, context): def Predict(self, request, context):
""" """
@@ -163,12 +192,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
if XPU: if XPU:
inputs = inputs.to("xpu") inputs = inputs.to("xpu")
outputs = self.model.generate(inputs,max_new_tokens=max_tokens, temperature=request.Temperature, top_p=request.TopP) outputs = self.model.generate(inputs,max_new_tokens=max_tokens, temperature=request.Temperature, top_p=request.TopP, do_sample=True, pad_token_id=self.tokenizer.eos_token_id)
generated_text = self.tokenizer.batch_decode(outputs[:, inputs.shape[1]:], skip_special_tokens=True)[0]
generated_text = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
# Remove prompt from response if present
if request.Prompt in generated_text:
generated_text = generated_text.replace(request.Prompt, "")
return backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8')) return backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))

View File

@@ -10,10 +10,6 @@ import (
) )
func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) { func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) {
if !backendConfig.Embeddings {
return nil, fmt.Errorf("endpoint disabled for this model by API configuration")
}
modelFile := backendConfig.Model modelFile := backendConfig.Model
grpcOpts := gRPCModelOpts(backendConfig) grpcOpts := gRPCModelOpts(backendConfig)