Compare commits

..

4 Commits

Author SHA1 Message Date
Ettore Di Giacinto
5b8d6a31e2 docs(transformers): add docs section about transformers 2024-03-15 18:02:15 +01:00
Ettore Di Giacinto
f0752be4aa fix: adapt tts CLI 2024-03-14 19:24:50 +01:00
Ettore Di Giacinto
bafc9effad feat(openai/tts): compat layer with openai tts
Fixes: #1276
2024-03-14 18:15:28 +01:00
Ettore Di Giacinto
d2934dd69f feat(elevenlabs): map elevenlabs API support to TTS
This allows elevenlabs Clients to work automatically with LocalAI by
supporting the elevenlabs API.

The elevenlabs server endpoint is implemented such that it is wired to the
TTS endpoints.

Fixes: https://github.com/mudler/LocalAI/issues/1809
2024-03-14 18:12:47 +01:00
6 changed files with 37 additions and 60 deletions

View File

@@ -4,11 +4,11 @@ GOVET=$(GOCMD) vet
BINARY_NAME=local-ai BINARY_NAME=local-ai
# llama.cpp versions # llama.cpp versions
GOLLAMA_VERSION?=6a8041ef6b46d4712afc3ae791d1c2d73da0ad1c GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0
GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7 GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
CPPLLAMA_VERSION?=4755afd1cbd40d93c017e5b98c39796f52345314 CPPLLAMA_VERSION?=19885d205e768579ab090d1e99281cae58c21b54
# gpt4all version # gpt4all version
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -19,13 +19,13 @@ RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6 RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6
# whisper.cpp version # whisper.cpp version
WHISPER_CPP_VERSION?=a56f435fd475afd7edf02bfbf9f8c77f527198c2 WHISPER_CPP_VERSION?=37a709f6558c6d9783199e2b8cbb136e1c41d346
# bert.cpp version # bert.cpp version
BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d
# go-piper version # go-piper version
PIPER_VERSION?=9d0100873a7dbb0824dfea40e8cec70a1b110759 PIPER_VERSION?=d6b6275ba037dabdba4a8b65dfdf6b2a73a67f07
# stablediffusion version # stablediffusion version
STABLEDIFFUSION_VERSION?=362df9da29f882dbf09ade61972d16a1f53c3485 STABLEDIFFUSION_VERSION?=362df9da29f882dbf09ade61972d16a1f53c3485
@@ -91,13 +91,10 @@ ifeq ($(BUILD_TYPE),openblas)
export WHISPER_OPENBLAS=1 export WHISPER_OPENBLAS=1
endif endif
ifeq ($(BUILD_TYPE),cublas) ifeq ($(BUILD_TYPE),cublas)
CGO_LDFLAGS+=-lcublas -lcudart -lculibos -lcublasLt -L$(CUDA_LIBPATH) CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
export LLAMA_CUBLAS=1 export LLAMA_CUBLAS=1
# required by whisper.cpp
export WHISPER_CUBLAS=1 export WHISPER_CUBLAS=1
CGO_LDFLAGS+=-L$(CUDA_PATH)/stubs -lcuda
endif endif
ifeq ($(BUILD_TYPE),hipblas) ifeq ($(BUILD_TYPE),hipblas)
@@ -465,6 +462,9 @@ backend-assets/grpc/llama: backend-assets/grpc sources/go-llama/libbinding.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama LIBRARY_PATH=$(CURDIR)/sources/go-llama \ CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama LIBRARY_PATH=$(CURDIR)/sources/go-llama \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./backend/go/llm/llama/ $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./backend/go/llm/llama/
# TODO: every binary should have its own folder instead, so can have different implementations # TODO: every binary should have its own folder instead, so can have different implementations
ifeq ($(BUILD_TYPE),metal)
cp backend/cpp/llama/llama.cpp/ggml-metal.metal backend-assets/grpc/
endif
## BACKEND CPP LLAMA START ## BACKEND CPP LLAMA START
# Sets the variables in case it has to build the gRPC locally. # Sets the variables in case it has to build the gRPC locally.
@@ -494,7 +494,7 @@ backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/grpc-server
cp -rfv backend/cpp/llama/grpc-server backend-assets/grpc/llama-cpp cp -rfv backend/cpp/llama/grpc-server backend-assets/grpc/llama-cpp
# TODO: every binary should have its own folder instead, so can have different metal implementations # TODO: every binary should have its own folder instead, so can have different metal implementations
ifeq ($(BUILD_TYPE),metal) ifeq ($(BUILD_TYPE),metal)
cp backend/cpp/llama/llama.cpp/build/bin/default.metallib backend-assets/grpc/ cp backend/cpp/llama/llama.cpp/build/bin/ggml-metal.metal backend-assets/grpc/
endif endif
backend-assets/grpc/llama-ggml: backend-assets/grpc sources/go-llama-ggml/libbinding.a backend-assets/grpc/llama-ggml: backend-assets/grpc sources/go-llama-ggml/libbinding.a

View File

@@ -18,7 +18,6 @@ else ifeq ($(BUILD_TYPE),clblas)
# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ # If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
else ifeq ($(BUILD_TYPE),hipblas) else ifeq ($(BUILD_TYPE),hipblas)
CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON
# If it's OSX, DO NOT embed the metal library - -DLLAMA_METAL_EMBED_LIBRARY=ON requires further investigation
endif endif
ifeq ($(BUILD_TYPE),sycl_f16) ifeq ($(BUILD_TYPE),sycl_f16)

View File

@@ -1084,7 +1084,7 @@ struct llama_server_context
slot.has_next_token = false; slot.has_next_token = false;
} }
if (result.tok == llama_token_eos(model)) if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model))
{ {
slot.stopped_eos = true; slot.stopped_eos = true;
slot.has_next_token = false; slot.has_next_token = false;

View File

@@ -30,7 +30,6 @@ dependencies:
- async-timeout==4.0.3 - async-timeout==4.0.3
- attrs==23.1.0 - attrs==23.1.0
- bark==0.1.5 - bark==0.1.5
- bitsandbytes==0.43.0
- boto3==1.28.61 - boto3==1.28.61
- botocore==1.31.61 - botocore==1.31.61
- certifi==2023.7.22 - certifi==2023.7.22

View File

@@ -23,7 +23,7 @@ if XPU:
from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM
from transformers import AutoTokenizer, AutoModel, set_seed from transformers import AutoTokenizer, AutoModel, set_seed
else: else:
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed, BitsAndBytesConfig from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed
_ONE_DAY_IN_SECONDS = 60 * 60 * 24 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
@@ -75,50 +75,18 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
A Result object that contains the result of the LoadModel operation. A Result object that contains the result of the LoadModel operation.
""" """
model_name = request.Model model_name = request.Model
compute = "auto"
if request.F16Memory == True:
compute=torch.bfloat16
self.CUDA = request.CUDA
device_map="cpu"
quantization = None
if self.CUDA:
if request.Device:
device_map=request.Device
else:
device_map="cuda:0"
if request.Quantization == "bnb_4bit":
quantization = BitsAndBytesConfig(
load_in_4bit = True,
bnb_4bit_compute_dtype = compute,
bnb_4bit_quant_type = "nf4",
bnb_4bit_use_double_quant = True,
load_in_8bit = False,
)
elif request.Quantization == "bnb_8bit":
quantization = BitsAndBytesConfig(
load_in_4bit=False,
bnb_4bit_compute_dtype = None,
load_in_8bit=True,
)
try: try:
if request.Type == "AutoModelForCausalLM": if request.Type == "AutoModelForCausalLM":
if XPU: if XPU:
if quantization == "xpu_4bit":
xpu_4bit = True
self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode, self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode,
device_map="xpu", load_in_4bit=xpu_4bit) device_map="xpu", load_in_4bit=True)
else: else:
self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode, use_safetensors=True, quantization_config=quantization, device_map=device_map, torch_dtype=compute) self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode)
else: else:
self.model = AutoModel.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode, use_safetensors=True, quantization_config=quantization, device_map=device_map, torch_dtype=compute) self.model = AutoModel.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode)
self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_safetensors=True)
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
self.CUDA = False
self.XPU = False self.XPU = False
if XPU: if XPU:
@@ -129,6 +97,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
except Exception as err: except Exception as err:
print("Not using XPU:", err, file=sys.stderr) print("Not using XPU:", err, file=sys.stderr)
if request.CUDA or torch.cuda.is_available():
try:
print("Loading model", model_name, "to CUDA.", file=sys.stderr)
self.model = self.model.to("cuda")
self.CUDA = True
except Exception as err:
print("Not using CUDA:", err, file=sys.stderr)
except Exception as err: except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
# Implement your logic here for the LoadModel service # Implement your logic here for the LoadModel service
@@ -155,17 +130,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
encoded_input = self.tokenizer(request.Embeddings, padding=True, truncation=True, max_length=max_length, return_tensors="pt") encoded_input = self.tokenizer(request.Embeddings, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
# Create word embeddings # Create word embeddings
if self.CUDA: model_output = self.model(**encoded_input)
encoded_input = encoded_input.to("cuda")
with torch.no_grad():
model_output = self.model(**encoded_input)
# Pool to get sentence embeddings; i.e. generate one 1024 vector for the entire sentence # Pool to get sentence embeddings; i.e. generate one 1024 vector for the entire sentence
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']) sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']).detach().numpy()
print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr) print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr)
print("Embeddings:", sentence_embeddings, file=sys.stderr) print("Embeddings:", sentence_embeddings, file=sys.stderr)
return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings[0]) return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings)
def Predict(self, request, context): def Predict(self, request, context):
""" """
@@ -192,8 +163,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
if XPU: if XPU:
inputs = inputs.to("xpu") inputs = inputs.to("xpu")
outputs = self.model.generate(inputs,max_new_tokens=max_tokens, temperature=request.Temperature, top_p=request.TopP, do_sample=True, pad_token_id=self.tokenizer.eos_token_id) outputs = self.model.generate(inputs,max_new_tokens=max_tokens, temperature=request.Temperature, top_p=request.TopP)
generated_text = self.tokenizer.batch_decode(outputs[:, inputs.shape[1]:], skip_special_tokens=True)[0]
generated_text = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
# Remove prompt from response if present
if request.Prompt in generated_text:
generated_text = generated_text.replace(request.Prompt, "")
return backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8')) return backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))

View File

@@ -10,6 +10,10 @@ import (
) )
func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) { func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) {
if !backendConfig.Embeddings {
return nil, fmt.Errorf("endpoint disabled for this model by API configuration")
}
modelFile := backendConfig.Model modelFile := backendConfig.Model
grpcOpts := gRPCModelOpts(backendConfig) grpcOpts := gRPCModelOpts(backendConfig)