Compare commits

..

4 Commits

Author SHA1 Message Date
Ettore Di Giacinto
5b8d6a31e2 docs(transformers): add docs section about transformers 2024-03-15 18:02:15 +01:00
Ettore Di Giacinto
f0752be4aa fix: adapt tts CLI 2024-03-14 19:24:50 +01:00
Ettore Di Giacinto
bafc9effad feat(openai/tts): compat layer with openai tts
Fixes: #1276
2024-03-14 18:15:28 +01:00
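A usage sketch, not part of this changeset: assuming the compat layer follows OpenAI's POST /v1/audio/speech request shape, a client could drive it as below; the model name, voice, and output filename are illustrative placeholders, not values taken from these commits.

package main

import (
	"bytes"
	"encoding/json"
	"io"
	"log"
	"net/http"
	"os"
)

func main() {
	// Request body in OpenAI's audio/speech shape; the field values are
	// placeholders for whatever is configured server-side.
	payload, err := json.Marshal(map[string]string{
		"model": "tts-1",
		"input": "Hello from LocalAI",
		"voice": "alloy",
	})
	if err != nil {
		log.Fatal(err)
	}
	resp, err := http.Post("http://localhost:8080/v1/audio/speech",
		"application/json", bytes.NewReader(payload))
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	audio, err := io.ReadAll(resp.Body) // raw audio bytes on success
	if err != nil {
		log.Fatal(err)
	}
	if err := os.WriteFile("speech.wav", audio, 0o644); err != nil {
		log.Fatal(err)
	}
}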
Ettore Di Giacinto
d2934dd69f feat(elevenlabs): map elevenlabs API support to TTS
This allows elevenlabs clients to work automatically with LocalAI by
supporting the elevenlabs API.

The elevenlabs server endpoint is implemented so that it is wired to the
TTS endpoints.

Fixes: https://github.com/mudler/LocalAI/issues/1809
2024-03-14 18:12:47 +01:00
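As a sketch of what this wiring enables, assuming the elevenlabs-shaped route is POST /v1/text-to-speech/{voice-id} with a JSON body carrying the text, as in the upstream elevenlabs API; the base URL and voice id below are hypothetical, and the voice-id-to-model mapping is an assumption for illustration.

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
)

// speak posts an elevenlabs-shaped request against /v1/text-to-speech/{voice-id}
// and returns the synthesized audio bytes. The voice id doubles as the model
// selector here; that mapping is assumed, not taken verbatim from this diff.
func speak(baseURL, voiceID, text string) ([]byte, error) {
	payload, err := json.Marshal(map[string]string{"text": text})
	if err != nil {
		return nil, err
	}
	url := fmt.Sprintf("%s/v1/text-to-speech/%s", baseURL, voiceID)
	resp, err := http.Post(url, "application/json", bytes.NewReader(payload))
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("tts request failed: %s", resp.Status)
	}
	return io.ReadAll(resp.Body)
}

func main() {
	audio, err := speak("http://localhost:8080", "my-voice", "Hello from LocalAI")
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("received %d bytes of audio", len(audio))
}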
5 changed files with 34 additions and 54 deletions

View File

@@ -4,11 +4,11 @@ GOVET=$(GOCMD) vet
 BINARY_NAME=local-ai

 # llama.cpp versions
-GOLLAMA_VERSION?=6a8041ef6b46d4712afc3ae791d1c2d73da0ad1c
+GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0
 GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
-CPPLLAMA_VERSION?=d84c48505f60bcd358b82a751d40418c4d235643
+CPPLLAMA_VERSION?=19885d205e768579ab090d1e99281cae58c21b54

 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -25,7 +25,7 @@ WHISPER_CPP_VERSION?=37a709f6558c6d9783199e2b8cbb136e1c41d346
 BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d

 # go-piper version
-PIPER_VERSION?=9d0100873a7dbb0824dfea40e8cec70a1b110759
+PIPER_VERSION?=d6b6275ba037dabdba4a8b65dfdf6b2a73a67f07

 # stablediffusion version
 STABLEDIFFUSION_VERSION?=362df9da29f882dbf09ade61972d16a1f53c3485
@@ -462,6 +462,9 @@ backend-assets/grpc/llama: backend-assets/grpc sources/go-llama/libbinding.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama LIBRARY_PATH=$(CURDIR)/sources/go-llama \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./backend/go/llm/llama/
 # TODO: every binary should have its own folder instead, so can have different implementations
+ifeq ($(BUILD_TYPE),metal)
+	cp backend/cpp/llama/llama.cpp/ggml-metal.metal backend-assets/grpc/
+endif

 ## BACKEND CPP LLAMA START
 # Sets the variables in case it has to build the gRPC locally.
@@ -491,7 +494,7 @@ backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/grpc-server
 	cp -rfv backend/cpp/llama/grpc-server backend-assets/grpc/llama-cpp
 # TODO: every binary should have its own folder instead, so can have different metal implementations
 ifeq ($(BUILD_TYPE),metal)
-	cp backend/cpp/llama/llama.cpp/build/bin/default.metallib backend-assets/grpc/
+	cp backend/cpp/llama/llama.cpp/build/bin/ggml-metal.metal backend-assets/grpc/
 endif

 backend-assets/grpc/llama-ggml: backend-assets/grpc sources/go-llama-ggml/libbinding.a

View File

@@ -18,7 +18,6 @@ else ifeq ($(BUILD_TYPE),clblas)
 # If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
 else ifeq ($(BUILD_TYPE),hipblas)
 	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON
-# If it's OSX, DO NOT embed the metal library - -DLLAMA_METAL_EMBED_LIBRARY=ON requires further investigation
 endif

 ifeq ($(BUILD_TYPE),sycl_f16)

View File

@@ -30,7 +30,6 @@ dependencies:
   - async-timeout==4.0.3
   - attrs==23.1.0
   - bark==0.1.5
-  - bitsandbytes==0.43.0
   - boto3==1.28.61
   - botocore==1.31.61
   - certifi==2023.7.22

View File

@@ -23,7 +23,7 @@ if XPU:
     from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM
     from transformers import AutoTokenizer, AutoModel, set_seed
 else:
-    from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed, BitsAndBytesConfig
+    from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed

 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
@@ -75,50 +75,18 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             A Result object that contains the result of the LoadModel operation.
         """
         model_name = request.Model
-        compute = "auto"
-        if request.F16Memory == True:
-            compute=torch.bfloat16
-        self.CUDA = request.CUDA
-        device_map="cpu"
-        quantization = None
-        if self.CUDA:
-            if request.Device:
-                device_map=request.Device
-            else:
-                device_map="cuda:0"
-            if request.Quantization == "bnb_4bit":
-                quantization = BitsAndBytesConfig(
-                    load_in_4bit = True,
-                    bnb_4bit_compute_dtype = compute,
-                    bnb_4bit_quant_type = "nf4",
-                    bnb_4bit_use_double_quant = True,
-                    load_in_8bit = False,
-                )
-            elif request.Quantization == "bnb_8bit":
-                quantization = BitsAndBytesConfig(
-                    load_in_4bit=False,
-                    bnb_4bit_compute_dtype = None,
-                    load_in_8bit=True,
-                )
-
         try:
             if request.Type == "AutoModelForCausalLM":
                 if XPU:
-                    if quantization == "xpu_4bit":
-                        xpu_4bit = True
-                    self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode,
-                                                                      device_map="xpu", load_in_4bit=xpu_4bit)
+                    self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode,
+                                                                      device_map="xpu", load_in_4bit=True)
                 else:
-                    self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode, use_safetensors=True, quantization_config=quantization, device_map=device_map, torch_dtype=compute)
+                    self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode)
             else:
-                self.model = AutoModel.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode, use_safetensors=True, quantization_config=quantization, device_map=device_map, torch_dtype=compute)
-            self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_safetensors=True)
+                self.model = AutoModel.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode)
+            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+            self.CUDA = False
+            self.XPU = False
             if XPU:
@@ -129,6 +97,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                 except Exception as err:
                     print("Not using XPU:", err, file=sys.stderr)
+            if request.CUDA or torch.cuda.is_available():
+                try:
+                    print("Loading model", model_name, "to CUDA.", file=sys.stderr)
+                    self.model = self.model.to("cuda")
+                    self.CUDA = True
+                except Exception as err:
+                    print("Not using CUDA:", err, file=sys.stderr)
         except Exception as err:
             return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")

         # Implement your logic here for the LoadModel service
@@ -155,17 +130,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         encoded_input = self.tokenizer(request.Embeddings, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
         # Create word embeddings
-        if self.CUDA:
-            encoded_input = encoded_input.to("cuda")
-
-        with torch.no_grad():
-            model_output = self.model(**encoded_input)
+        model_output = self.model(**encoded_input)

         # Pool to get sentence embeddings; i.e. generate one 1024 vector for the entire sentence
-        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']).detach().numpy()

         print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr)
         print("Embeddings:", sentence_embeddings, file=sys.stderr)
-        return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings[0])
+        return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings)

     def Predict(self, request, context):
         """
@@ -192,8 +163,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         if XPU:
             inputs = inputs.to("xpu")
-        outputs = self.model.generate(inputs,max_new_tokens=max_tokens, temperature=request.Temperature, top_p=request.TopP, do_sample=True, pad_token_id=self.tokenizer.eos_token_id)
-        generated_text = self.tokenizer.batch_decode(outputs[:, inputs.shape[1]:], skip_special_tokens=True)[0]
+        outputs = self.model.generate(inputs,max_new_tokens=max_tokens, temperature=request.Temperature, top_p=request.TopP)
+        generated_text = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
+
+        # Remove prompt from response if present
+        if request.Prompt in generated_text:
+            generated_text = generated_text.replace(request.Prompt, "")

         return backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))

View File

@@ -10,6 +10,10 @@ import (
 )

 func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) {
+	if !backendConfig.Embeddings {
+		return nil, fmt.Errorf("endpoint disabled for this model by API configuration")
+	}
+
 	modelFile := backendConfig.Model

 	grpcOpts := gRPCModelOpts(backendConfig)