Merge branch 'master' into cleanup_deps

Also drop ttf files
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-05-23 16:20:01 -04:00 · 2024-08-21 13:10:46 +02:00 · 2024-08-21 13:03:26 +02:00 · 2024-08-21 13:02:19 +02:00 · 2024-08-21 13:02:19 +02:00 · 2024-08-21 13:02:19 +02:00
136 changed files with 1228 additions and 3701 deletions
--- a/.devcontainer-scripts/utils.sh
+++ b/.devcontainer-scripts/utils.sh
@@ -32,22 +32,18 @@ config_remote() {
 }

 # Setup special .ssh files
-# Prints out lines of text to make things pretty
+# Copies each listed file into ~/.ssh with mode 600 unless it already exists there, then lists ~/.ssh
 # Param 1: bash array, filenames relative to the customization directory that should be copied to ~/.ssh
 setup_ssh() {
-    echo "starting ~/.ssh directory setup..."
-    mkdir -p "${HOME}.ssh"
-    chmod 0700 "${HOME}/.ssh"
-    echo "-----"
    local files=("$@")
-    for file in "${files[@]}" ; do
+    for file in "${files[@]}"; do
        local cfile="/devcontainer-customization/${file}"
-        local hfile="${HOME}/.ssh/${file}"
+        local hfile="${HOME}/.ssh/${file}"  # use ${HOME}: a tilde is not expanded inside double quotes
        if [ ! -f "${hfile}" ]; then
-            echo "copying \"${file}\""
+            echo "copying ${file}"
            cp "${cfile}" "${hfile}"
            chmod 600 "${hfile}"
        fi
    done
-    echo "~/.ssh directory setup complete!"
+    ls ~/.ssh
 }
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -56,7 +56,7 @@ jobs:
          rm -rfv ${{ matrix.variable }}_message.txt
          rm -rfv ${{ matrix.variable }}_commit.txt
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v7
+        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
--- a/.github/workflows/bump_docs.yaml
+++ b/.github/workflows/bump_docs.yaml
@@ -17,7 +17,7 @@ jobs:
        run: |
          bash .github/bump_docs.sh ${{ matrix.repository }}
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v7
+        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
--- a/.github/workflows/checksum_checker.yaml
+++ b/.github/workflows/checksum_checker.yaml
@@ -36,7 +36,7 @@ jobs:
          sudo chmod 777 /hf_cache
          bash .github/checksum_checker.sh gallery/index.yaml
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v7
+        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -294,7 +294,7 @@ jobs:
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
          export PATH=$PATH:$GOPATH/bin
-          export SKIP_GRPC_BACKEND=backend-assets/grpc/whisper
+
          make dist
      - uses: actions/upload-artifact@v4
        with:
@@ -327,7 +327,7 @@ jobs:
          cache: false
      - name: Dependencies
        run: |
-          brew install protobuf grpc libomp llvm
+          brew install protobuf grpc
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
      - name: Build
@@ -336,7 +336,7 @@ jobs:
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
          export PATH=$PATH:$GOPATH/bin
-          export CC=/opt/homebrew/opt/llvm/bin/clang
+
          make dist
      - uses: actions/upload-artifact@v4
        with:
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -18,7 +18,7 @@ jobs:
        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
        if: ${{ github.actor != 'dependabot[bot]' }}
-        uses: securego/gosec@v2.21.0
+        uses: securego/gosec@master
        with:
          # we let the report trigger content trigger a failure using the GitHub Security features.
          args: '-no-fail -fmt sarif -out results.sarif ./...'
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -214,13 +214,12 @@ jobs:
        run: go version
      - name: Dependencies
        run: |
-          brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
+          brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc
          pip install --user --no-cache-dir grpcio-tools==1.64.1
      - name: Test
        run: |
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
-          export CC=/opt/homebrew/opt/llvm/bin/clang
          # Used to run the newer GNUMake version from brew that supports --output-sync
          export PATH="/opt/homebrew/opt/make/libexec/gnubin:$PATH"
          BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
--- a/.github/workflows/update_swagger.yaml
+++ b/.github/workflows/update_swagger.yaml
@@ -25,7 +25,7 @@ jobs:
        run: |
          make protogen-go swagger
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v7
+        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
--- a/39
+++ b/39
@@ -13,7 +13,7 @@ ARG TARGETARCH
 ARG TARGETVARIANT

 ENV DEBIAN_FRONTEND=noninteractive
-ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
+ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"


 RUN apt-get update && \
@@ -263,20 +263,14 @@ EOT
 # In most cases, builder is the image you should be using - however, this can save build time if one just needs to copy backend-assets/grpc/stablediffusion and nothing else.
 FROM builder-base AS builder-sd

-# stablediffusion does not tolerate a newer version of abseil, copy only over enough elements to build it
-COPY Makefile .
-COPY go.mod .
-COPY go.sum .
-COPY backend/backend.proto ./backend/backend.proto
-COPY backend/go/image/stablediffusion ./backend/go/image/stablediffusion
-COPY pkg/grpc ./pkg/grpc
-COPY pkg/stablediffusion ./pkg/stablediffusion
-RUN git init
-RUN make sources/go-stable-diffusion
-RUN touch prepare-sources
+COPY . .
+COPY .git .

-# Actually build the backend
-RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make backend-assets/grpc/stablediffusion
+RUN make prepare
+
+
+# stablediffusion does not tolerate a newer version of abseil, build it first
+RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build

 ###################################
 ###################################
@@ -291,20 +285,8 @@ COPY --from=grpc /opt/grpc /usr/local
 # Rebuild with defaults backends
 WORKDIR /build

-COPY . .
-COPY .git .
-
-RUN make prepare
-
 ## Build the binary
-## If it's CUDA, we want to skip some of the llama-compat backends to save space
-## We only leave the most CPU-optimized variant and the fallback for the cublas build
-## (both will use CUDA for the actual computation)
-RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
-        SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
-    else \
-        make build; \
-    fi
+RUN make build

 RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
        mkdir -p /build/sources/go-piper/piper-phonemize/pi/lib/ \
@@ -418,6 +400,9 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAG
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/transformers-musicgen \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "exllama1" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/exllama \
    ; fi

 RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
--- a/21
+++ b/21
@@ -8,7 +8,7 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=e6b7801bd189d102d901d3e72035611a25456ef1
+CPPLLAMA_VERSION?=2f3c1466ff46a2413b0e363a5005c46538186ee6

 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
@@ -16,7 +16,7 @@ RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
-WHISPER_CPP_VERSION?=a551933542d956ae84634937acd2942eb40efaaf
+WHISPER_CPP_VERSION?=d65786ea540a5aef21f67cacfa6f134097727780

 # bert.cpp version
 BERT_REPO?=https://github.com/go-skynet/go-bert.cpp
@@ -338,7 +338,7 @@ rebuild: ## Rebuilds the project
 	$(MAKE) -C sources/go-tiny-dream clean
 	$(MAKE) build

-prepare: prepare-sources $(OPTIONAL_TARGETS)
+prepare: prepare-sources gen-assets $(OPTIONAL_TARGETS)

 clean: ## Remove build related file
 	$(GOCMD) clean -cache
@@ -534,10 +534,10 @@ protogen-go-clean:
 	$(RM) bin/*

 .PHONY: protogen-python
-protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen
+protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen

 .PHONY: protogen-python-clean
-protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean  exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean
+protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean

 .PHONY: autogptq-protogen
 autogptq-protogen:
@@ -571,6 +571,14 @@ diffusers-protogen:
 diffusers-protogen-clean:
 	$(MAKE) -C backend/python/diffusers protogen-clean

+.PHONY: exllama-protogen
+exllama-protogen:
+	$(MAKE) -C backend/python/exllama protogen
+
+.PHONY: exllama-protogen-clean
+exllama-protogen-clean:
+	$(MAKE) -C backend/python/exllama protogen-clean
+
 .PHONY: exllama2-protogen
 exllama2-protogen:
 	$(MAKE) -C backend/python/exllama2 protogen
@@ -667,6 +675,7 @@ prepare-extra-conda-environments: protogen-python
 	$(MAKE) -C backend/python/parler-tts
 	$(MAKE) -C backend/python/vall-e-x
 	$(MAKE) -C backend/python/openvoice
+	$(MAKE) -C backend/python/exllama
 	$(MAKE) -C backend/python/exllama2

 prepare-test-extra: protogen-python
@@ -837,7 +846,7 @@ endif

 backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/libwhisper.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="$(CURDIR)/sources/whisper.cpp/include:$(CURDIR)/sources/whisper.cpp/ggml/include" LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/whisper
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/
 ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/whisper
 endif
--- a/README.md
+++ b/README.md
@@ -40,7 +40,7 @@

 > :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
 >
-> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/go-skynet/LocalAI/tree/master/examples/) 
+> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/) [ 🚀 Roadmap ](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

 [![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)

@@ -72,7 +72,6 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu

 [Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

- Aug 2024:  🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io)
 - July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723
 - June 2024: 🆕 You can browse now the model gallery without LocalAI! Check out https://models.localai.io
 - June 2024: Support for models from OCI registries: https://github.com/mudler/LocalAI/pull/2628
--- a/aio/intel/image-gen.yaml
+++ b/aio/intel/image-gen.yaml
@@ -1,6 +1,6 @@
 name: stablediffusion
 parameters:
-  model: Lykon/dreamshaper-8
+  model: runwayml/stable-diffusion-v1-5
 backend: diffusers
 step: 25
 f16: true
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -16,7 +16,6 @@ service Backend {
  rpc GenerateImage(GenerateImageRequest) returns (Result) {}
  rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
  rpc TTS(TTSRequest) returns (Result) {}
-  rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
  rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
  rpc Status(HealthMessage) returns (StatusResponse) {}

@@ -271,17 +270,6 @@ message TTSRequest {
  optional string language = 5;
 }

-message SoundGenerationRequest {
-  string text = 1;
-  string model = 2;
-  string dst = 3;
-  optional float duration = 4;
-  optional float temperature = 5;
-  optional bool sample = 6;
-  optional string src = 7;
-  optional int32 src_divisor = 8;
-}
-
 message TokenizationResponse {
  int32 length = 1;
  repeated int32 tokens = 2;
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -17,10 +17,11 @@
 #include "common.h"
 #include "json.hpp"
 #include "llama.h"
+#include "grammar-parser.h"
 #include "backend.pb.h"
 #include "backend.grpc.pb.h"
 #include "utils.hpp"
-#include "sampling.h"
+
 // include std::regex
 #include <cstddef>
 #include <thread>
@@ -202,8 +203,8 @@ struct llama_client_slot
    std::string stopping_word;

    // sampling
-    struct gpt_sampler_params sparams;
-    gpt_sampler *ctx_sampling = nullptr;
+    struct llama_sampling_params sparams;
+    llama_sampling_context *ctx_sampling = nullptr;

    int32_t ga_i = 0;   // group-attention state
    int32_t ga_n = 1;   // group-attention factor
@@ -618,7 +619,7 @@ struct llama_server_context

    bool launch_slot_with_data(llama_client_slot* &slot, json data) {
        slot_params default_params;
-        gpt_sampler_params default_sparams;
+        llama_sampling_params default_sparams;
 
        slot->params.stream             = json_value(data, "stream",            false);
        slot->params.cache_prompt       = json_value(data, "cache_prompt",      false);
@@ -627,7 +628,7 @@ struct llama_server_context
        slot->sparams.top_p             = json_value(data, "top_p",             default_sparams.top_p);
        slot->sparams.min_p             = json_value(data, "min_p",             default_sparams.min_p);
        slot->sparams.tfs_z             = json_value(data, "tfs_z",             default_sparams.tfs_z);
-        slot->sparams.typ_p             = json_value(data, "typical_p",         default_sparams.typ_p);
+        slot->sparams.typical_p         = json_value(data, "typical_p",         default_sparams.typical_p);
        slot->sparams.temp              = json_value(data, "temperature",       default_sparams.temp);
        slot->sparams.dynatemp_range    = json_value(data, "dynatemp_range",    default_sparams.dynatemp_range);
        slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
@@ -640,7 +641,7 @@ struct llama_server_context
        slot->sparams.mirostat_eta      = json_value(data, "mirostat_eta",      default_sparams.mirostat_eta);
        slot->sparams.penalize_nl       = json_value(data, "penalize_nl",       default_sparams.penalize_nl);
        slot->params.n_keep             = json_value(data, "n_keep",            slot->params.n_keep);
-        slot->sparams.seed               = json_value(data, "seed",              default_sparams.seed);
+        slot->params.seed               = json_value(data, "seed",              default_params.seed);
        slot->sparams.grammar           = json_value(data, "grammar",           default_sparams.grammar);
        slot->sparams.n_probs           = json_value(data, "n_probs",           default_sparams.n_probs);
        slot->sparams.min_keep          = json_value(data, "min_keep",          default_sparams.min_keep);
@@ -664,7 +665,6 @@ struct llama_server_context
            slot->params.input_prefix = "";
        }

-
        if (data.count("input_suffix") != 0)
        {
            slot->params.input_suffix = data["input_suffix"];
@@ -683,10 +683,6 @@ struct llama_server_context
            slot->prompt = "";
        }

-        if (json_value(data, "ignore_eos", false)) {
-                slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
-        }
-        /*
        slot->sparams.penalty_prompt_tokens.clear();
        slot->sparams.use_penalty_prompt_tokens = false;
        const auto &penalty_prompt = data.find("penalty_prompt");
@@ -722,10 +718,14 @@ struct llama_server_context
                slot->sparams.use_penalty_prompt_tokens = true;
            }
        }
-      */

        slot->sparams.logit_bias.clear();

+        if (json_value(data, "ignore_eos", false))
+        {
+            slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
+        }
+
        const auto &logit_bias = data.find("logit_bias");
        if (logit_bias != data.end() && logit_bias->is_array())
        {
@@ -753,7 +753,7 @@ struct llama_server_context
                        llama_token tok = el[0].get<llama_token>();
                        if (tok >= 0 && tok < n_vocab)
                        {
-                            slot->sparams.logit_bias.push_back({tok, bias});
+                            slot->sparams.logit_bias[tok] = bias;
                        }
                    }
                    else if (el[0].is_string())
@@ -761,13 +761,13 @@ struct llama_server_context
                        auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
                        for (auto tok : toks)
                        {
-                            slot->sparams.logit_bias.push_back({tok, bias});
+                            slot->sparams.logit_bias[tok] = bias;
                        }
                    }
                }
            }
        }
-        
+
        slot->params.antiprompt.clear();

        const auto &stop = data.find("stop");
@@ -781,22 +781,24 @@ struct llama_server_context
                }
            }
        }
-        
-        const auto & samplers = data.find("samplers");
-        if (samplers != data.end() && samplers->is_array()) {
+
+        const auto &samplers_sequence = data.find("samplers");
+        if (samplers_sequence != data.end() && samplers_sequence->is_array())
+        {
            std::vector<std::string> sampler_names;
-                for (const auto & name : *samplers) {
-                    if (name.is_string()) {
-                        sampler_names.emplace_back(name);
-                    }
+            for (const auto &sampler_name : *samplers_sequence)
+            {
+                if (sampler_name.is_string())
+                {
+                    sampler_names.emplace_back(sampler_name);
                }
-                slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
+            }
+            slot->sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
        }
        else
        {
-                slot->sparams.samplers = default_sparams.samplers;
+            slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
        }
-        

        if (multimodal)
        {
@@ -873,10 +875,10 @@ struct llama_server_context

        if (slot->ctx_sampling != nullptr)
        {
-            gpt_sampler_free(slot->ctx_sampling);
+            llama_sampling_free(slot->ctx_sampling);
        }
-        slot->ctx_sampling = gpt_sampler_init(model, slot->sparams);
-        //llama_set_rng_seed(ctx, slot->params.seed);
+        slot->ctx_sampling = llama_sampling_init(slot->sparams);
+        llama_set_rng_seed(ctx, slot->params.seed);
        slot->command = LOAD_PROMPT;

        all_slots_are_idle = false;
@@ -886,7 +888,7 @@ struct llama_server_context
            {"task_id", slot->task_id},
        });

-      //  LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
+        LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());

        return true;
    }
@@ -1004,13 +1006,11 @@ struct llama_server_context
        slot.generated_text += token_str;
        slot.has_next_token = true;

-/*
        if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
        {
            // we can change penalty_prompt_tokens because it is always created from scratch each request
            slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok);
        }
-        */

        // check if there is incomplete UTF-8 character at the end
        bool incomplete = false;
@@ -1119,7 +1119,7 @@ struct llama_server_context
                continue;
            }

-            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
+            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
                LOG_TEE("Error processing the given image");
                return false;
            }
@@ -1144,11 +1144,13 @@ struct llama_server_context

    json get_formated_generation(llama_client_slot &slot)
    {
-        std::vector<std::string> samplers;
-        samplers.reserve(slot.sparams.samplers.size());
-        for (const auto & sampler : slot.sparams.samplers)
+        const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
+        const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
+                                eos_bias->second < 0.0f && std::isinf(eos_bias->second);
+        std::vector<std::string> samplers_sequence;
+        for (const auto &sampler_type : slot.sparams.samplers_sequence)
        {
-            samplers.emplace_back(gpt_sampler_type_to_str(sampler));
+            samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
        }

        return json {
@@ -1163,11 +1165,13 @@ struct llama_server_context
            {"top_p",             slot.sparams.top_p},
            {"min_p",             slot.sparams.min_p},
            {"tfs_z",             slot.sparams.tfs_z},
-            {"typical_p",         slot.sparams.typ_p},
+            {"typical_p",         slot.sparams.typical_p},
            {"repeat_last_n",     slot.sparams.penalty_last_n},
            {"repeat_penalty",    slot.sparams.penalty_repeat},
            {"presence_penalty",  slot.sparams.penalty_present},
            {"frequency_penalty", slot.sparams.penalty_freq},
+            {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens},
+            {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens},
            {"mirostat",          slot.sparams.mirostat},
            {"mirostat_tau",      slot.sparams.mirostat_tau},
            {"mirostat_eta",      slot.sparams.mirostat_eta},
@@ -1175,13 +1179,13 @@ struct llama_server_context
            {"stop",              slot.params.antiprompt},
            {"n_predict",         slot.params.n_predict},
            {"n_keep",            params.n_keep},
-            {"ignore_eos",        slot.sparams.ignore_eos},
+            {"ignore_eos",        ignore_eos},
            {"stream",            slot.params.stream},
-      //      {"logit_bias",        slot.sparams.logit_bias},
+            {"logit_bias",        slot.sparams.logit_bias},
            {"n_probs",           slot.sparams.n_probs},
            {"min_keep",          slot.sparams.min_keep},
            {"grammar",           slot.sparams.grammar},
-            {"samplers",          samplers}
+            {"samplers",          samplers_sequence}
        };
    }

@@ -1710,7 +1714,7 @@ struct llama_server_context

                    if (!slot.params.cache_prompt)
                    {
-                        gpt_sampler_reset(slot.ctx_sampling);
+                        llama_sampling_reset(slot.ctx_sampling);

                        slot.n_past = 0;
                        slot.n_past_se = 0;
@@ -1722,7 +1726,7 @@ struct llama_server_context
                        // push the prompt into the sampling context (do not apply grammar)
                        for (auto &token : prompt_tokens)
                        {
-                            gpt_sampler_accept(slot.ctx_sampling, token, false);
+                            llama_sampling_accept(slot.ctx_sampling, ctx, token, false);
                        }

                        slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
@@ -1930,9 +1934,9 @@ struct llama_server_context
                }

                completion_token_output result;
-                const llama_token id = gpt_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
+                const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i);

-                gpt_sampler_accept(slot.ctx_sampling, id, true);
+                llama_sampling_accept(slot.ctx_sampling, ctx, id, true);

                slot.n_decoded += 1;
                if (slot.n_decoded == 1)
@@ -1942,14 +1946,19 @@ struct llama_server_context
                    metrics.on_prompt_eval(slot);
                }

+                llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
                result.tok = id;
-                const auto * cur_p = gpt_sampler_get_candidates(slot.ctx_sampling);

-                for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
-                    result.probs.push_back({
-                        cur_p->data[i].id,
-                        i >= cur_p->size ? 0.0f : cur_p->data[i].p,
-                    });
+                const int32_t n_probs = slot.sparams.n_probs;
+                if (slot.sparams.temp <= 0 && n_probs > 0)
+                {
+                    // for llama_sample_token_greedy we need to sort candidates
+                    llama_sample_softmax(ctx, &cur_p);
+                }
+
+                for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
+                {
+                    result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
                }

                if (!process_token(result, slot))
@@ -2201,7 +2210,7 @@ static void params_parse(const backend::ModelOptions* request,
    params.model_alias =  request->modelfile();
    params.n_ctx = request->contextsize();
    //params.memory_f16 = request->f16memory();
-    params.cpuparams.n_threads = request->threads();
+    params.n_threads = request->threads();
    params.n_gpu_layers = request->ngpulayers();
    params.n_batch = request->nbatch();
    // Set params.n_parallel by environment variable (LLAMA_PARALLEL), defaults to 1
--- a/backend/cpp/llama/patches/01-llava.patch
+++ b/backend/cpp/llama/patches/01-llava.patch
@@ -1,13 +0,0 @@
-diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index 342042ff..224db9b5 100644
--- a/examples/llava/clip.cpp
-+++ b/examples/llava/clip.cpp
-@@ -2419,7 +2419,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
-             struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
-             int* patches_data = (int*)malloc(ggml_nbytes(patches));
-             for (int i = 0; i < num_patches; i++) {
-                patches_data[i] = i + 1;
-+                patches_data[i] = i;
-             }
-             ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
-             free(patches_data);
--- a/backend/cpp/llama/prepare.sh
+++ b/backend/cpp/llama/prepare.sh
@@ -1,12 +1,5 @@
 #!/bin/bash

-## Patches
-## Apply patches from the `patches` directory
-for patch in $(ls patches); do
-    echo "Applying patch $patch"
-    patch -d llama.cpp/ -p1 < patches/$patch
-done 
-
 cp -r CMakeLists.txt llama.cpp/examples/grpc-server/
 cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
 cp -rfv json.hpp llama.cpp/examples/grpc-server/
--- a/backend/cpp/llama/utils.hpp
+++ b/backend/cpp/llama/utils.hpp
@@ -480,4 +480,31 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
    }

    return ret;
+}
+
+//
+// random string / id
+//
+
+// Returns a 32-character string whose characters are drawn from the digits
+// and the upper/lower-case ASCII letters. A fresh mt19937, seeded from
+// std::random_device, is created on every call.
+static std::string random_string()
+{
+    static const std::string alphabet("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
+    const size_t id_length = 32;
+
+    std::random_device seed_source;
+    std::mt19937 rng(seed_source());
+
+    std::string id(id_length, ' ');
+    for (char & ch : id) {
+        // one engine draw per character, reduced into the alphabet's index range
+        ch = alphabet[rng() % alphabet.size()];
+    }
+
+    return id;
+}
+
+// Builds a chat-completion id: the literal prefix "chatcmpl-" followed by the
+// value returned by random_string().
+static std::string gen_chatcmplid()
+{
+    return std::string("chatcmpl-") + random_string();
 }
--- a/backend/go/transcribe/whisper/main.go
+++ b/backend/go/transcribe/whisper/main.go
--- a/backend/go/transcribe/transcript.go
+++ b/backend/go/transcribe/transcript.go
@@ -0,0 +1,104 @@
+package main
+
+import (
+	"fmt"
+	"io"
+	"os"
+	"os/exec"
+	"path/filepath"
+
+	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
+	"github.com/go-audio/wav"
+	"github.com/mudler/LocalAI/core/schema"
+)
+
+// ffmpegCommand runs ffmpeg with the given arguments, inheriting the current
+// process environment, and returns its combined stdout/stderr as a string
+// together with the error reported by the run.
+func ffmpegCommand(args []string) (string, error) {
+	// Constrain this to ffmpeg to permit security scanner to see that the command is safe.
+	ffmpeg := exec.Command("ffmpeg", args...)
+	ffmpeg.Env = os.Environ()
+
+	output, runErr := ffmpeg.CombinedOutput()
+	return string(output), runErr
+}
+
+// audioToWav converts the audio file at src into a wav file at dst for
+// transcription by invoking ffmpeg (flags: -ar 16000, -ac 1, -acodec pcm_s16le).
+// When ffmpeg fails, the returned error wraps the run error and includes
+// ffmpeg's combined output.
+// TODO: use https://github.com/mccoyst/ogg?
+func audioToWav(src, dst string) error {
+	ffmpegArgs := []string{
+		"-i", src,
+		"-format", "s16le",
+		"-ar", "16000",
+		"-ac", "1",
+		"-acodec", "pcm_s16le",
+		dst,
+	}
+
+	if output, runErr := ffmpegCommand(ffmpegArgs); runErr != nil {
+		return fmt.Errorf("error: %w out: %s", runErr, output)
+	}
+	return nil
+}
+
+// Transcript converts the audio file at audiopath to wav inside a temporary
+// directory, feeds the samples to a new context of the given whisper model
+// and collects the resulting segments.
+//
+// language is handed to the context as-is; an empty string selects "auto".
+// translate switches on the context's translate option, threads is forwarded
+// to SetThreads.
+//
+// The result carries one schema.Segment per whisper segment and, in Text, the
+// concatenation of all segment texts. On failure the (possibly partially
+// filled) result is returned together with the error.
+func Transcript(model whisper.Model, audiopath, language string, translate bool, threads uint) (schema.TranscriptionResult, error) {
+	res := schema.TranscriptionResult{}
+
+	// Scratch directory for the converted file; removed again on return.
+	dir, err := os.MkdirTemp("", "whisper")
+	if err != nil {
+		return res, err
+	}
+	defer os.RemoveAll(dir)
+
+	convertedPath := filepath.Join(dir, "converted.wav")
+
+	if err := audioToWav(audiopath, convertedPath); err != nil {
+		return res, err
+	}
+
+	// Open samples
+	fh, err := os.Open(convertedPath)
+	if err != nil {
+		return res, err
+	}
+	defer fh.Close()
+
+	// Read samples
+	d := wav.NewDecoder(fh)
+	buf, err := d.FullPCMBuffer()
+	if err != nil {
+		return res, err
+	}
+
+	data := buf.AsFloat32Buffer().Data
+
+	// Process samples
+	context, err := model.NewContext()
+	if err != nil {
+		return res, err
+	}
+
+	context.SetThreads(threads)
+
+	if language != "" {
+		context.SetLanguage(language)
+	} else {
+		context.SetLanguage("auto")
+	}
+
+	if translate {
+		context.SetTranslate(true)
+	}
+
+	if err := context.Process(data, nil, nil); err != nil {
+		return res, err
+	}
+
+	for {
+		s, err := context.NextSegment()
+		// The whisper bindings report the end of the segment list as io.EOF.
+		// Any other error is a real failure and must not be passed off as a
+		// successful (but truncated) transcript.
+		if err == io.EOF {
+			break
+		}
+		if err != nil {
+			return res, err
+		}
+
+		var tokens []int
+		for _, t := range s.Tokens {
+			tokens = append(tokens, t.Id)
+		}
+
+		segment := schema.Segment{Id: s.Num, Text: s.Text, Start: s.Start, End: s.End, Tokens: tokens}
+		res.Segments = append(res.Segments, segment)
+
+		res.Text += s.Text
+	}
+
+	return res, nil
+}
--- a/backend/go/transcribe/whisper.go
+++ b/backend/go/transcribe/whisper.go
@@ -0,0 +1,26 @@
+package main
+
+// This is a wrapper to satisfy the GRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+)
+
+// Whisper is the transcription backend type. It embeds base.SingleThread and
+// holds the whisper model that Load assigns and AudioTranscription reads.
+type Whisper struct {
+	base.SingleThread
+	whisper whisper.Model // assigned by Load
+}
+
+// Load opens the whisper model referenced by opts.ModelFile and stores it on
+// the receiver. If whisper.New fails, its error is returned and the model
+// currently held by the receiver is left unchanged.
+func (sd *Whisper) Load(opts *pb.ModelOptions) error {
+	// Note: the Model here is a path to a directory containing the model files
+	w, err := whisper.New(opts.ModelFile)
+	if err != nil {
+		return err
+	}
+
+	sd.whisper = w
+	return nil
+}
+
+// AudioTranscription transcribes the audio file named by opts.Dst with the
+// model held by the receiver. It delegates to Transcript, forwarding the
+// request's Language, Translate and Threads fields.
+func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (schema.TranscriptionResult, error) {
+	return Transcript(sd.whisper, opts.Dst, opts.Language, opts.Translate, uint(opts.Threads))
+}
--- a/backend/go/transcribe/whisper/whisper.go
+++ b/backend/go/transcribe/whisper/whisper.go
@@ -1,105 +0,0 @@
-package main
-
-// This is a wrapper to statisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
-import (
-	"os"
-	"path/filepath"
-
-	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
-	"github.com/go-audio/wav"
-	"github.com/mudler/LocalAI/pkg/grpc/base"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-	"github.com/mudler/LocalAI/pkg/utils"
-)
-
-type Whisper struct {
-	base.SingleThread
-	whisper whisper.Model
-}
-
-func (sd *Whisper) Load(opts *pb.ModelOptions) error {
-	// Note: the Model here is a path to a directory containing the model files
-	w, err := whisper.New(opts.ModelFile)
-	sd.whisper = w
-	return err
-}
-
-func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.TranscriptResult, error) {
-
-	dir, err := os.MkdirTemp("", "whisper")
-	if err != nil {
-		return pb.TranscriptResult{}, err
-	}
-	defer os.RemoveAll(dir)
-
-	convertedPath := filepath.Join(dir, "converted.wav")
-
-	if err := utils.AudioToWav(opts.Dst, convertedPath); err != nil {
-		return pb.TranscriptResult{}, err
-	}
-
-	// Open samples
-	fh, err := os.Open(convertedPath)
-	if err != nil {
-		return pb.TranscriptResult{}, err
-	}
-	defer fh.Close()
-
-	// Read samples
-	d := wav.NewDecoder(fh)
-	buf, err := d.FullPCMBuffer()
-	if err != nil {
-		return pb.TranscriptResult{}, err
-	}
-
-	data := buf.AsFloat32Buffer().Data
-
-	// Process samples
-	context, err := sd.whisper.NewContext()
-	if err != nil {
-		return pb.TranscriptResult{}, err
-
-	}
-
-	context.SetThreads(uint(opts.Threads))
-
-	if opts.Language != "" {
-		context.SetLanguage(opts.Language)
-	} else {
-		context.SetLanguage("auto")
-	}
-
-	if opts.Translate {
-		context.SetTranslate(true)
-	}
-
-	if err := context.Process(data, nil, nil); err != nil {
-		return pb.TranscriptResult{}, err
-	}
-
-	segments := []*pb.TranscriptSegment{}
-	text := ""
-	for {
-		s, err := context.NextSegment()
-		if err != nil {
-			break
-		}
-
-		var tokens []int32
-		for _, t := range s.Tokens {
-			tokens = append(tokens, int32(t.Id))
-		}
-
-		segment := &pb.TranscriptSegment{Id: int32(s.Num), Text: s.Text, Start: int64(s.Start), End: int64(s.End), Tokens: tokens}
-		segments = append(segments, segment)
-
-		text += s.Text
-	}
-
-	return pb.TranscriptResult{
-		Segments: segments,
-		Text:     text,
-	}, nil
-
-}
--- a/backend/python/autogptq/requirements.txt
+++ b/backend/python/autogptq/requirements.txt
@@ -1,6 +1,6 @@
 accelerate
 auto-gptq==0.7.1
-grpcio==1.66.1
+grpcio==1.65.4
 protobuf
 certifi
 transformers
--- a/backend/python/bark/requirements.txt
+++ b/backend/python/bark/requirements.txt
@@ -1,4 +1,4 @@
 bark==0.1.5
-grpcio==1.66.1
+grpcio==1.65.5
 protobuf
 certifi
--- a/backend/python/common/template/requirements.txt
+++ b/backend/python/common/template/requirements.txt
@@ -1,2 +1,2 @@
-grpcio==1.66.1
+grpcio==1.65.5
 protobuf
--- a/backend/python/coqui/requirements.txt
+++ b/backend/python/coqui/requirements.txt
@@ -1,4 +1,4 @@
 TTS==0.22.0
-grpcio==1.66.1
+grpcio==1.65.5
 protobuf
 certifi
--- a/backend/python/diffusers/backend.py
+++ b/backend/python/diffusers/backend.py
@@ -168,7 +168,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            if request.CFGScale != 0:
                self.cfg_scale = request.CFGScale

-            clipmodel = "Lykon/dreamshaper-8"
+            clipmodel = "runwayml/stable-diffusion-v1-5"
            if request.CLIPModel != "":
                clipmodel = request.CLIPModel
            clipsubfolder = "text_encoder"
--- a/backend/python/diffusers/requirements.txt
+++ b/backend/python/diffusers/requirements.txt
@@ -1,5 +1,5 @@
 setuptools
-grpcio==1.66.1
+grpcio==1.65.4
 pillow
 protobuf
 certifi
--- a/backend/python/diffusers/test.py
+++ b/backend/python/diffusers/test.py
@@ -53,7 +53,7 @@ class TestBackendServicer(unittest.TestCase):
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="Lykon/dreamshaper-8"))
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="runwayml/stable-diffusion-v1-5"))
                self.assertTrue(response.success)
                self.assertEqual(response.message, "Model loaded successfully")
        except Exception as err:
@@ -71,7 +71,7 @@ class TestBackendServicer(unittest.TestCase):
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="Lykon/dreamshaper-8"))
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="runwayml/stable-diffusion-v1-5"))
                print(response.message)
                self.assertTrue(response.success)
                image_req = backend_pb2.GenerateImageRequest(positive_prompt="cat", width=16,height=16, dst="test.jpg")
@@ -81,4 +81,4 @@ class TestBackendServicer(unittest.TestCase):
            print(err)
            self.fail("Image gen service failed")
        finally:
-            self.tearDown()
+            self.tearDown()
--- a/backend/python/exllama/.gitignore
+++ b/backend/python/exllama/.gitignore
@@ -0,0 +1 @@
+source
--- a/backend/python/exllama/Makefile
+++ b/backend/python/exllama/Makefile
@@ -0,0 +1,25 @@
+# Exported to the recipe shells and passed as the first argument to install.sh.
+export CONDA_ENV_PATH = "exllama.yml"
+
+# Install the backend; the gRPC stubs are generated first.
+.PHONY: exllama
+exllama: protogen
+	bash install.sh ${CONDA_ENV_PATH}
+
+# Start the backend through run.sh; the gRPC stubs are generated first.
+.PHONY: run
+run: protogen
+	@echo "Running exllama..."
+	bash run.sh
+	@echo "exllama run."
+
+# Convenience alias for the two generated stub files.
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+# Remove the generated stub files.
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+# Generate both stub files from backend.proto, searched for via -I../.. (two directories up).
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
+
+# Remove the generated stubs plus the venv, source and __pycache__ directories.
+.PHONY: clean
+clean: protogen-clean
+	$(RM) -r venv source __pycache__
--- a/backend/python/exllama/README.md
+++ b/backend/python/exllama/README.md
@@ -0,0 +1,5 @@
+# Creating a separate environment for the exllama project
+
+```
+make exllama
+```
--- a/backend/python/exllama/backend.py
+++ b/backend/python/exllama/backend.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+import grpc
+from concurrent import futures
+import time
+import backend_pb2
+import backend_pb2_grpc
+import argparse
+import signal
+import sys
+import os, glob
+
+from pathlib import Path
+import torch
+import torch.nn.functional as F
+from torch import version as torch_version
+
+from source.tokenizer import ExLlamaTokenizer
+from source.generator import ExLlamaGenerator
+from source.model import ExLlama, ExLlamaCache, ExLlamaConfig
+
+# Sleep interval of the idle loop in serve(), which just sleeps again after each interval
+_ONE_DAY_IN_SECONDS = 60 * 60 * 24
+
+# Size of the gRPC thread pool: PYTHON_GRPC_MAX_WORKERS from the environment if set, otherwise 1
+MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
+
+# Implement the BackendServicer class with the service methods
+class BackendServicer(backend_pb2_grpc.BackendServicer):
+    """gRPC servicer that runs text generation through ExLlama.
+
+    LoadModel sets self.generator, self.model, self.tokenizer and self.cache;
+    Predict and PredictStream use those attributes and therefore need a
+    successful LoadModel call first.
+    """
+
+    def generate(self, prompt, max_new_tokens):
+        """Generate at most max_new_tokens tokens after prompt.
+
+        Stops early when the end-of-sequence token is produced. Returns the
+        decoded text of the newly generated part of the sequence, or '' when
+        max_new_tokens is 0.
+        """
+        self.generator.end_beam_search()
+
+        # Tokenizing the input
+        ids = self.generator.tokenizer.encode(prompt)
+
+        self.generator.gen_begin_reuse(ids)
+        # Length of the sequence before generating; everything past it is new output
+        initial_len = self.generator.sequence[0].shape[0]
+        has_leading_space = False
+        decoded_text = ''
+        for i in range(max_new_tokens):
+            token = self.generator.gen_single_token()
+            # If the first generated piece starts with '▁', a space is put back in front of the decoded text below
+            if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'):
+                has_leading_space = True
+
+            decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:])
+            if has_leading_space:
+                decoded_text = ' ' + decoded_text
+
+            if token.item() == self.generator.tokenizer.eos_token_id:
+                break
+        return decoded_text
+
+    def Health(self, request, context):
+        """Liveness probe: always answers with a Reply carrying b"OK"."""
+        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
+
+    def LoadModel(self, request, context):
+        """Load an ExLlama model from the directory request.ModelFile.
+
+        The directory has to contain tokenizer.model, config.json and at least
+        one *.safetensors file (the first match is used as the weights file).
+        request.ContextSize and request.RopeFreqScale, when non-zero, override
+        the corresponding config values.
+
+        Returns a Result with success=True on success; any exception is
+        reported as a Result with success=False and the error in message.
+        """
+        try:
+            # https://github.com/turboderp/exllama/blob/master/example_cfg.py
+            model_directory = request.ModelFile
+
+            # Locate files we need within that directory
+            tokenizer_path = os.path.join(model_directory, "tokenizer.model")
+            model_config_path = os.path.join(model_directory, "config.json")
+            st_pattern = os.path.join(model_directory, "*.safetensors")
+            st_files = glob.glob(st_pattern)
+            if not st_files:
+                # Report the real problem instead of an IndexError from indexing an empty list
+                raise FileNotFoundError(f"no *.safetensors file found in {model_directory}")
+            model_path = st_files[0]
+
+            # Create config, model, tokenizer and generator
+
+            config = ExLlamaConfig(model_config_path)               # create config from config.json
+            config.model_path = model_path                          # supply path to model weights file
+            if (request.ContextSize):
+                config.max_seq_len = request.ContextSize            # override max sequence length
+                config.max_attention_size = request.ContextSize**2  # Should be set to context_size^2.
+                # https://github.com/turboderp/exllama/issues/220#issuecomment-1720324163
+
+            # Set Rope scaling.
+            if (request.RopeFreqScale):
+                # Alpha value for Rope scaling.
+                # Higher value increases context but adds perplexity.
+                # alpha_value and compress_pos_emb are mutually exclusive.
+                # https://github.com/turboderp/exllama/issues/115
+                config.alpha_value = request.RopeFreqScale
+                config.calculate_rotary_embedding_base()
+
+            model = ExLlama(config)                                 # create ExLlama instance and load the weights
+            tokenizer = ExLlamaTokenizer(tokenizer_path)            # create tokenizer from tokenizer model file
+
+            cache = ExLlamaCache(model, batch_size = 2)             # create cache for inference
+            generator = ExLlamaGenerator(model, tokenizer, cache)   # create generator
+
+            self.generator = generator
+            self.model = model
+            self.tokenizer = tokenizer
+            self.cache = cache
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+        return backend_pb2.Result(message="Model loaded successfully", success=True)
+
+    def Predict(self, request, context):
+        """Generate a completion for request.Prompt and return it as UTF-8 bytes.
+
+        Sampling settings come from the request: Penalty (1.15 when 0),
+        Temperature, TopK and TopP. Tokens limits the number of generated
+        tokens (512 when 0).
+        """
+        penalty = 1.15
+        if request.Penalty != 0.0:
+            penalty = request.Penalty
+        self.generator.settings.token_repetition_penalty_max = penalty
+        self.generator.settings.temperature = request.Temperature
+        self.generator.settings.top_k = request.TopK
+        self.generator.settings.top_p = request.TopP
+
+        tokens = 512
+        if request.Tokens != 0:
+            tokens = request.Tokens
+
+        # Rebuild cache and generator with batch_size=2 if the current cache was created with batch_size 1
+        if self.cache.batch_size == 1:
+            del self.cache
+            self.cache = ExLlamaCache(self.model, batch_size=2)
+            self.generator = ExLlamaGenerator(self.model, self.tokenizer, self.cache)
+
+        t = self.generate(request.Prompt, tokens)
+
+        # Remove prompt from response if present
+        if request.Prompt in t:
+            t = t.replace(request.Prompt, "")
+
+        # NOTE(review): Health answers with Reply while this builds a Result from bytes; confirm against backend.proto
+        return backend_pb2.Result(message=bytes(t, encoding='utf-8'))
+
+    def PredictStream(self, request, context):
+        """Streaming variant of Predict.
+
+        Token-by-token streaming is not implemented yet: the complete Predict
+        result is produced as the single item of the response stream.
+        """
+        # A response-streaming handler has to hand back an iterable of messages,
+        # so yield the one message rather than returning it.
+        yield self.Predict(request, context)
+
+
+def serve(address):
+    """Start the gRPC server on address and block until it is interrupted.
+
+    The server uses a thread pool of MAX_WORKERS workers and an insecure port.
+    SIGINT and SIGTERM stop the server immediately and exit the process with
+    status 0.
+    """
+    grpc_server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), grpc_server)
+    grpc_server.add_insecure_port(address)
+    grpc_server.start()
+    print("Server started. Listening on: " + address, file=sys.stderr)
+
+    def _shutdown(signum, frame):
+        # Shared handler for both termination signals
+        print("Received termination signal. Shutting down...")
+        grpc_server.stop(0)
+        sys.exit(0)
+
+    for termination_signal in (signal.SIGINT, signal.SIGTERM):
+        signal.signal(termination_signal, _shutdown)
+
+    # Keep the main thread alive; the server does its work on the pool threads
+    try:
+        while True:
+            time.sleep(_ONE_DAY_IN_SECONDS)
+    except KeyboardInterrupt:
+        grpc_server.stop(0)
+
+if __name__ == "__main__":
+    # Command-line entry point: read --addr (default localhost:50051) and serve on it
+    arg_parser = argparse.ArgumentParser(description="Run the gRPC server.")
+    arg_parser.add_argument("--addr", default="localhost:50051", help="The address to bind the server to.")
+    cli_args = arg_parser.parse_args()
+
+    serve(cli_args.addr)
--- a/backend/python/exllama/install.sh
+++ b/backend/python/exllama/install.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# Installs the exllama backend: installs the requirements, fetches the exllama
+# sources into ${MY_DIR}/source, installs their requirements and copies this
+# directory's *py files next to them.
+# MY_DIR, BUILD_ISOLATION_FLAG and installRequirements are not defined here;
+# they are expected to come from the sourced libbackend.sh.
+set -e
+
+# Not used in this script itself; presumably consumed by libbackend.sh (confirm there)
+LIMIT_TARGETS="cublas"
+
+# Quoted so a checkout path containing spaces still resolves
+source "$(dirname "$0")/../common/libbackend.sh"
+
+installRequirements
+
+# git clone refuses a non-empty target directory, which under `set -e` would
+# abort every re-run of the installer; only clone when no checkout is present.
+if [ ! -d "${MY_DIR}/source/.git" ]; then
+    git clone https://github.com/turboderp/exllama "${MY_DIR}/source"
+fi
+# BUILD_ISOLATION_FLAG stays unquoted on purpose: when empty it must disappear from the command line
+uv pip install ${BUILD_ISOLATION_FLAG} --requirement "${MY_DIR}/source/requirements.txt"
+
+cp -v ./*py "${MY_DIR}/source/"
--- a/backend/python/exllama/requirements-cpu.txt
+++ b/backend/python/exllama/requirements-cpu.txt
@@ -0,0 +1,3 @@
+transformers
+accelerate
+torch
--- a/backend/python/exllama/requirements-cublas11.txt
+++ b/backend/python/exllama/requirements-cublas11.txt
@@ -0,0 +1,4 @@
+--extra-index-url https://download.pytorch.org/whl/cu118
+torch
+transformers
+accelerate
--- a/backend/python/exllama/requirements-cublas12.txt
+++ b/backend/python/exllama/requirements-cublas12.txt
@@ -0,0 +1,3 @@
+torch
+transformers
+accelerate
--- a/backend/python/exllama/requirements.txt
+++ b/backend/python/exllama/requirements.txt
@@ -0,0 +1,4 @@
+grpcio==1.65.5
+protobuf
+certifi
+setuptools
--- a/backend/python/exllama/run.sh
+++ b/backend/python/exllama/run.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Starts the exllama backend through startBackend, which is expected to come
+# from the sourced libbackend.sh. All command-line arguments are forwarded.
+
+# Not used in this script itself; presumably consumed by libbackend.sh (confirm there)
+LIMIT_TARGETS="cublas"
+
+# Nothing in this script assigns MY_DIR before this line, so unless the caller
+# exports it the path used to collapse to "/source/backend.py". Fall back to
+# this script's own directory in that case.
+# NOTE(review): assumes MY_DIR is the backend directory that install.sh populates; confirm against libbackend.sh
+BACKEND_FILE="${MY_DIR:-$(cd "$(dirname "$0")" && pwd)}/source/backend.py"
+
+# Quoted so a checkout path containing spaces still resolves
+source "$(dirname "$0")/../common/libbackend.sh"
+
+# "$@" keeps every argument intact; a bare $@ would re-split arguments containing whitespace
+startBackend "$@"
--- a/backend/python/exllama/test.sh
+++ b/backend/python/exllama/test.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+# Runs the backend's unit tests through runUnittests, which is expected to come
+# from the sourced libbackend.sh. Aborts on the first failing command.
+set -e
+
+# Quoted so a checkout path containing spaces still resolves
+source "$(dirname "$0")/../common/libbackend.sh"
+
+runUnittests
--- a/backend/python/exllama2/requirements.txt
+++ b/backend/python/exllama2/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.65.4
 protobuf
 certifi
 wheel
--- a/backend/python/mamba/requirements.txt
+++ b/backend/python/mamba/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.66.1
+grpcio==1.65.5
 protobuf
 certifi
--- a/backend/python/openvoice/requirements-intel.txt
+++ b/backend/python/openvoice/requirements-intel.txt
@@ -2,7 +2,7 @@
 intel-extension-for-pytorch
 torch
 optimum[openvino]
-grpcio==1.66.1
+grpcio==1.65.5
 protobuf
 librosa==0.9.1
 faster-whisper==1.0.3
--- a/backend/python/openvoice/requirements.txt
+++ b/backend/python/openvoice/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.65.5
 protobuf
 librosa
 faster-whisper
--- a/backend/python/parler-tts/requirements-hipblas.txt
+++ b/backend/python/parler-tts/requirements-hipblas.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
-torch==2.3.0+rocm6.0
-torchaudio==2.3.0+rocm6.0
+torch
+torchaudio
 transformers
-accelerate
+accelerate
--- a/backend/python/parler-tts/requirements.txt
+++ b/backend/python/parler-tts/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.65.5
 protobuf
 certifi
 llvmlite==0.43.0
--- a/backend/python/rerankers/requirements.txt
+++ b/backend/python/rerankers/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.66.1
+grpcio==1.65.4
 protobuf
 certifi
--- a/backend/python/sentencetransformers/requirements.txt
+++ b/backend/python/sentencetransformers/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.66.1
+grpcio==1.65.5
 protobuf
 certifi
--- a/backend/python/transformers-musicgen/backend.py
+++ b/backend/python/transformers-musicgen/backend.py
@@ -15,7 +15,7 @@ import backend_pb2_grpc

 import grpc

-from scipy.io import wavfile
+from scipy.io.wavfile import write as write_wav
 from transformers import AutoProcessor, MusicgenForConditionalGeneration

 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
@@ -63,61 +63,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

        return backend_pb2.Result(message="Model loaded successfully", success=True)

-    def SoundGeneration(self, request, context):
-        model_name = request.model
-        if model_name == "":
-            return backend_pb2.Result(success=False, message="request.model is required")
-        try:
-            self.processor = AutoProcessor.from_pretrained(model_name)
-            self.model = MusicgenForConditionalGeneration.from_pretrained(model_name)
-            inputs = None
-            if request.text == "":
-                inputs = self.model.get_unconditional_inputs(num_samples=1)
-            elif request.HasField('src'):
-                # TODO SECURITY CODE GOES HERE LOL
-                # WHO KNOWS IF THIS WORKS???
-                sample_rate, wsamples = wavfile.read('path_to_your_file.wav')
-                
-                if request.HasField('src_divisor'):
-                    wsamples = wsamples[: len(wsamples) // request.src_divisor]
-                
-                inputs = self.processor(
-                    audio=wsamples,
-                    sampling_rate=sample_rate,
-                    text=[request.text],
-                    padding=True,
-                    return_tensors="pt",
-                )
-            else:
-                inputs = self.processor(
-                    text=[request.text],
-                    padding=True,
-                    return_tensors="pt",
-                )
-            
-            tokens = 256
-            if request.HasField('duration'):
-                tokens = int(request.duration * 51.2) # 256 tokens = 5 seconds, therefore 51.2 tokens is one second
-            guidance = 3.0
-            if request.HasField('temperature'):
-                guidance = request.temperature
-            dosample = True
-            if request.HasField('sample'):
-                dosample = request.sample
-            audio_values = self.model.generate(**inputs, do_sample=dosample, guidance_scale=guidance, max_new_tokens=tokens)
-            print("[transformers-musicgen] SoundGeneration generated!", file=sys.stderr)
-            sampling_rate = self.model.config.audio_encoder.sampling_rate
-            wavfile.write(request.dst, rate=sampling_rate, data=audio_values[0, 0].numpy())
-            print("[transformers-musicgen] SoundGeneration saved to", request.dst, file=sys.stderr)
-            print("[transformers-musicgen] SoundGeneration for", file=sys.stderr)
-            print("[transformers-musicgen] SoundGeneration requested tokens", tokens, file=sys.stderr)
-            print(request, file=sys.stderr)
-        except Exception as err:
-            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
-        return backend_pb2.Result(success=True)
-
-
-# The TTS endpoint is older, and provides fewer features, but exists for compatibility reasons
    def TTS(self, request, context):
        model_name = request.model
        if model_name == "":
@@ -130,7 +75,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                padding=True,
                return_tensors="pt",
            )
-            tokens = 512 # No good place to set the "length" in TTS, so use 10s as a sane default
+            tokens = 256
+            # TODO get tokens from request?
            audio_values = self.model.generate(**inputs, max_new_tokens=tokens)
            print("[transformers-musicgen] TTS generated!", file=sys.stderr)
            sampling_rate = self.model.config.audio_encoder.sampling_rate
--- a/backend/python/transformers-musicgen/requirements.txt
+++ b/backend/python/transformers-musicgen/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.65.5
 protobuf
 scipy==1.14.0
 certifi
--- a/backend/python/transformers-musicgen/test.py
+++ b/backend/python/transformers-musicgen/test.py
@@ -63,7 +63,7 @@ class TestBackendServicer(unittest.TestCase):

    def test_tts(self):
        """
-        This method tests if TTS is generated successfully
+        This method tests if the embeddings are generated successfully
        """
        try:
            self.setUp()
@@ -77,24 +77,5 @@ class TestBackendServicer(unittest.TestCase):
        except Exception as err:
            print(err)
            self.fail("TTS service failed")
-        finally:
-            self.tearDown()
-
-    def test_sound_generation(self):
-        """
-        This method tests if SoundGeneration is generated successfully
-        """
-        try:
-            self.setUp()
-            with grpc.insecure_channel("localhost:50051") as channel:
-                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/musicgen-small"))
-                self.assertTrue(response.success)
-                sg_request = backend_pb2.SoundGenerationRequest(text="80s TV news production music hit for tonight's biggest story")
-                sg_response = stub.SoundGeneration(sg_request)
-                self.assertIsNotNone(sg_response)
-        except Exception as err:
-            print(err)
-            self.fail("SoundGeneration service failed")
        finally:
            self.tearDown()
--- a/backend/python/transformers/requirements.txt
+++ b/backend/python/transformers/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.65.5
 protobuf
 certifi
 setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/vall-e-x/requirements.txt
+++ b/backend/python/vall-e-x/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.66.1
+grpcio==1.65.5
 protobuf
 certifi
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -135,26 +135,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        res = await gen.__anext__()
        return res

-    def Embedding(self, request, context):
-        """
-        A gRPC method that calculates embeddings for a given sentence.
-
-        Args:
-            request: An EmbeddingRequest object that contains the request parameters.
-            context: A grpc.ServicerContext object that provides information about the RPC.
-
-        Returns:
-            An EmbeddingResult object that contains the calculated embeddings.
-        """
-        print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr)
-        outputs = self.model.encode(request.Embeddings)
-        # Check if we have one result at least
-        if len(outputs) == 0:
-            context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
-            context.set_details("No embeddings were calculated.")
-            return backend_pb2.EmbeddingResult()
-        return backend_pb2.EmbeddingResult(embeddings=outputs[0].outputs.embedding)
-
    async def PredictStream(self, request, context):
        """
        Generates text based on the given prompt and sampling parameters, and streams the results.
--- a/backend/python/vllm/requirements.txt
+++ b/backend/python/vllm/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.65.5
 protobuf
 certifi
 setuptools
--- a/backend/python/vllm/test.py
+++ b/backend/python/vllm/test.py
@@ -72,28 +72,5 @@ class TestBackendServicer(unittest.TestCase):
        except Exception as err:
            print(err)
            self.fail("text service failed")
-        finally:
-            self.tearDown()
-
-    def test_embedding(self):
-        """
-        This method tests if the embeddings are generated successfully
-        """
-        try:
-            self.setUp()
-            with grpc.insecure_channel("localhost:50051") as channel:
-                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="intfloat/e5-mistral-7b-instruct"))
-                self.assertTrue(response.success)
-                embedding_request = backend_pb2.PredictOptions(Embeddings="This is a test sentence.")
-                embedding_response = stub.Embedding(embedding_request)
-                self.assertIsNotNone(embedding_response.embeddings)
-                # assert that is a list of floats
-                self.assertIsInstance(embedding_response.embeddings, list)
-                # assert that the list is not empty
-                self.assertTrue(len(embedding_response.embeddings) > 0)
-        except Exception as err:
-            print(err)
-            self.fail("Embedding service failed")
        finally:
            self.tearDown()
--- a/core/backend/backend_suite_test.go
+++ b/core/backend/backend_suite_test.go
@@ -1,13 +0,0 @@
-package backend_test
-
-import (
-	"testing"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-func TestBackend(t *testing.T) {
-	RegisterFailHandler(Fail)
-	RunSpecs(t, "Backend test suite")
-}
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -9,8 +9,6 @@ import (
 	"sync"
 	"unicode/utf8"

-	"github.com/rs/zerolog/log"
-
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"

@@ -89,7 +87,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 			case string:
 				protoMessages[i].Content = ct
 			default:
-				return nil, fmt.Errorf("unsupported type for schema.Message.Content for inference: %T", ct)
+				return nil, fmt.Errorf("Unsupported type for schema.Message.Content for inference: %T", ct)
 			}
 		}
 	}
@@ -183,37 +181,13 @@ func Finetune(config config.BackendConfig, input, prediction string) string {
 		mu.Lock()
 		reg, ok := cutstrings[c]
 		if !ok {
-			r, err := regexp.Compile(c)
-			if err != nil {
-				log.Fatal().Err(err).Msg("failed to compile regex")
-			}
-			cutstrings[c] = r
+			cutstrings[c] = regexp.MustCompile(c)
 			reg = cutstrings[c]
 		}
 		mu.Unlock()
 		prediction = reg.ReplaceAllString(prediction, "")
 	}

-	// extract results from the response which can be for instance inside XML tags
-	var predResult string
-	for _, r := range config.ExtractRegex {
-		mu.Lock()
-		reg, ok := cutstrings[r]
-		if !ok {
-			regex, err := regexp.Compile(r)
-			if err != nil {
-				log.Fatal().Err(err).Msg("failed to compile regex")
-			}
-			cutstrings[r] = regex
-			reg = regex
-		}
-		mu.Unlock()
-		predResult += reg.FindString(prediction)
-	}
-	if predResult != "" {
-		prediction = predResult
-	}
-
 	for _, c := range config.TrimSpace {
 		prediction = strings.TrimSpace(strings.TrimPrefix(prediction, c))
 	}
--- a/core/backend/llm_test.go
+++ b/core/backend/llm_test.go
@@ -1,109 +0,0 @@
-package backend_test
-
-import (
-	. "github.com/mudler/LocalAI/core/backend"
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/schema"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-var _ = Describe("LLM tests", func() {
-	Context("Finetune LLM output", func() {
-		var (
-			testConfig config.BackendConfig
-			input      string
-			prediction string
-			result     string
-		)
-
-		BeforeEach(func() {
-			testConfig = config.BackendConfig{
-				PredictionOptions: schema.PredictionOptions{
-					Echo: false,
-				},
-				LLMConfig: config.LLMConfig{
-					Cutstrings:   []string{`<.*?>`},                  // Example regex for removing XML tags
-					ExtractRegex: []string{`<result>(.*?)</result>`}, // Example regex to extract from tags
-					TrimSpace:    []string{" ", "\n"},
-					TrimSuffix:   []string{".", "!"},
-				},
-			}
-		})
-
-		Context("when echo is enabled", func() {
-			BeforeEach(func() {
-				testConfig.Echo = true
-				input = "Hello"
-				prediction = "World"
-			})
-
-			It("should prepend input to prediction", func() {
-				result = Finetune(testConfig, input, prediction)
-				Expect(result).To(Equal("HelloWorld"))
-			})
-		})
-
-		Context("when echo is disabled", func() {
-			BeforeEach(func() {
-				testConfig.Echo = false
-				input = "Hello"
-				prediction = "World"
-			})
-
-			It("should not modify the prediction with input", func() {
-				result = Finetune(testConfig, input, prediction)
-				Expect(result).To(Equal("World"))
-			})
-		})
-
-		Context("when cutstrings regex is applied", func() {
-			BeforeEach(func() {
-				input = ""
-				prediction = "<div>Hello</div> World"
-			})
-
-			It("should remove substrings matching cutstrings regex", func() {
-				result = Finetune(testConfig, input, prediction)
-				Expect(result).To(Equal("Hello World"))
-			})
-		})
-
-		Context("when extract regex is applied", func() {
-			BeforeEach(func() {
-				input = ""
-				prediction = "<response><result>42</result></response>"
-			})
-
-			It("should extract substrings matching the extract regex", func() {
-				result = Finetune(testConfig, input, prediction)
-				Expect(result).To(Equal("42"))
-			})
-		})
-
-		Context("when trimming spaces", func() {
-			BeforeEach(func() {
-				input = ""
-				prediction = "   Hello World   "
-			})
-
-			It("should trim spaces from the prediction", func() {
-				result = Finetune(testConfig, input, prediction)
-				Expect(result).To(Equal("Hello World"))
-			})
-		})
-
-		Context("when trimming suffixes", func() {
-			BeforeEach(func() {
-				input = ""
-				prediction = "Hello World."
-			})
-
-			It("should trim suffixes from the prediction", func() {
-				result = Finetune(testConfig, input, prediction)
-				Expect(result).To(Equal("Hello World"))
-			})
-		})
-	})
-})
--- a/core/backend/soundgeneration.go
+++ b/core/backend/soundgeneration.go
@@ -1,74 +0,0 @@
-package backend
-
-import (
-	"context"
-	"fmt"
-	"os"
-	"path/filepath"
-
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/pkg/grpc/proto"
-	"github.com/mudler/LocalAI/pkg/model"
-	"github.com/mudler/LocalAI/pkg/utils"
-)
-
-func SoundGeneration(
-	backend string,
-	modelFile string,
-	text string,
-	duration *float32,
-	temperature *float32,
-	doSample *bool,
-	sourceFile *string,
-	sourceDivisor *int32,
-	loader *model.ModelLoader,
-	appConfig *config.ApplicationConfig,
-	backendConfig config.BackendConfig,
-) (string, *proto.Result, error) {
-	if backend == "" {
-		return "", nil, fmt.Errorf("backend is a required parameter")
-	}
-
-	grpcOpts := gRPCModelOpts(backendConfig)
-	opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
-		model.WithBackendString(backend),
-		model.WithModel(modelFile),
-		model.WithContext(appConfig.Context),
-		model.WithAssetDir(appConfig.AssetsDestination),
-		model.WithLoadGRPCLoadModelOpts(grpcOpts),
-	})
-
-	soundGenModel, err := loader.BackendLoader(opts...)
-	if err != nil {
-		return "", nil, err
-	}
-
-	if soundGenModel == nil {
-		return "", nil, fmt.Errorf("could not load sound generation model")
-	}
-
-	if err := os.MkdirAll(appConfig.AudioDir, 0750); err != nil {
-		return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
-	}
-
-	fileName := utils.GenerateUniqueFileName(appConfig.AudioDir, "sound_generation", ".wav")
-	filePath := filepath.Join(appConfig.AudioDir, fileName)
-
-	res, err := soundGenModel.SoundGeneration(context.Background(), &proto.SoundGenerationRequest{
-		Text:        text,
-		Model:       modelFile,
-		Dst:         filePath,
-		Sample:      doSample,
-		Duration:    duration,
-		Temperature: temperature,
-		Src:         sourceFile,
-		SrcDivisor:  sourceDivisor,
-	})
-
-	// return RPC error if any
-	if !res.Success {
-		return "", nil, fmt.Errorf(res.Message)
-	}
-
-	return filePath, res, err
-}
--- a/core/backend/transcript.go
+++ b/core/backend/transcript.go
@@ -3,13 +3,12 @@ package backend
 import (
 	"context"
 	"fmt"
-	"time"

 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"

 	"github.com/mudler/LocalAI/pkg/grpc/proto"
-	"github.com/mudler/LocalAI/pkg/model"
+	model "github.com/mudler/LocalAI/pkg/model"
 )

 func ModelTranscription(audio, language string, translate bool, ml *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (*schema.TranscriptionResult, error) {
@@ -22,40 +21,19 @@ func ModelTranscription(audio, language string, translate bool, ml *model.ModelL
 		model.WithAssetDir(appConfig.AssetsDestination),
 	})

-	transcriptionModel, err := ml.BackendLoader(opts...)
+	whisperModel, err := ml.BackendLoader(opts...)
 	if err != nil {
 		return nil, err
 	}

-	if transcriptionModel == nil {
-		return nil, fmt.Errorf("could not load transcription model")
+	if whisperModel == nil {
+		return nil, fmt.Errorf("could not load whisper model")
 	}

-	r, err := transcriptionModel.AudioTranscription(context.Background(), &proto.TranscriptRequest{
+	return whisperModel.AudioTranscription(context.Background(), &proto.TranscriptRequest{
 		Dst:       audio,
 		Language:  language,
 		Translate: translate,
 		Threads:   uint32(*backendConfig.Threads),
 	})
-	if err != nil {
-		return nil, err
-	}
-	tr := &schema.TranscriptionResult{
-		Text: r.Text,
-	}
-	for _, s := range r.Segments {
-		var tks []int
-		for _, t := range s.Tokens {
-			tks = append(tks, int(t))
-		}
-		tr.Segments = append(tr.Segments,
-			schema.Segment{
-				Text:   s.Text,
-				Id:     int(s.Id),
-				Start:  time.Duration(s.Start),
-				End:    time.Duration(s.End),
-				Tokens: tks,
-			})
-	}
-	return tr, err
 }
--- a/core/backend/tts.go
+++ b/core/backend/tts.go
@@ -9,15 +9,31 @@ import (
 	"github.com/mudler/LocalAI/core/config"

 	"github.com/mudler/LocalAI/pkg/grpc/proto"
-	"github.com/mudler/LocalAI/pkg/model"
+	model "github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/utils"
 )

+func generateUniqueFileName(dir, baseName, ext string) string {
+	counter := 1
+	fileName := baseName + ext
+
+	for {
+		filePath := filepath.Join(dir, fileName)
+		_, err := os.Stat(filePath)
+		if os.IsNotExist(err) {
+			return fileName
+		}
+
+		counter++
+		fileName = fmt.Sprintf("%s_%d%s", baseName, counter, ext)
+	}
+}
+
 func ModelTTS(
 	backend,
 	text,
 	modelFile,
-	voice,
+	voice ,
 	language string,
 	loader *model.ModelLoader,
 	appConfig *config.ApplicationConfig,
@@ -50,7 +66,7 @@ func ModelTTS(
 		return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
 	}

-	fileName := utils.GenerateUniqueFileName(appConfig.AudioDir, "tts", ".wav")
+	fileName := generateUniqueFileName(appConfig.AudioDir, "tts", ".wav")
 	filePath := filepath.Join(appConfig.AudioDir, fileName)

 	// If the model file is not empty, we pass it joined with the model path
@@ -72,15 +88,12 @@ func ModelTTS(
 	}

 	res, err := ttsModel.TTS(context.Background(), &proto.TTSRequest{
-		Text:     text,
-		Model:    modelPath,
-		Voice:    voice,
-		Dst:      filePath,
+		Text:  text,
+		Model: modelPath,
+		Voice: voice,
+		Dst:   filePath,
 		Language: &language,
 	})
-	if err != nil {
-		return "", nil, err
-	}

 	// return RPC error if any
 	if !res.Success {
--- a/core/cli/api/p2p.go
+++ b/core/cli/api/p2p.go
@@ -1,80 +0,0 @@
-package cli_api
-
-import (
-	"context"
-	"fmt"
-	"net"
-	"os"
-	"strings"
-
-	"github.com/mudler/LocalAI/core/p2p"
-	"github.com/mudler/edgevpn/pkg/node"
-
-	"github.com/rs/zerolog/log"
-)
-
-func StartP2PStack(ctx context.Context, address, token, networkID string, federated bool) error {
-	var n *node.Node
-	// Here we are avoiding creating multiple nodes:
-	// - if the federated mode is enabled, we create a federated node and expose a service
-	// - exposing a service creates a node with specific options, and we don't want to create another node
-
-	// If the federated mode is enabled, we expose a service to the local instance running
-	// at r.Address
-	if federated {
-		_, port, err := net.SplitHostPort(address)
-		if err != nil {
-			return err
-		}
-
-		// Here a new node is created and started
-		// and a service is exposed by the node
-		node, err := p2p.ExposeService(ctx, "localhost", port, token, p2p.NetworkID(networkID, p2p.FederatedID))
-		if err != nil {
-			return err
-		}
-
-		if err := p2p.ServiceDiscoverer(ctx, node, token, p2p.NetworkID(networkID, p2p.FederatedID), nil, false); err != nil {
-			return err
-		}
-
-		n = node
-	}
-
-	// If the p2p mode is enabled, we start the service discovery
-	if token != "" {
-		// If a node wasn't created previously, create it
-		if n == nil {
-			node, err := p2p.NewNode(token)
-			if err != nil {
-				return err
-			}
-			err = node.Start(ctx)
-			if err != nil {
-				return fmt.Errorf("starting new node: %w", err)
-			}
-			n = node
-		}
-
-		// Attach a ServiceDiscoverer to the p2p node
-		log.Info().Msg("Starting P2P server discovery...")
-		if err := p2p.ServiceDiscoverer(ctx, n, token, p2p.NetworkID(networkID, p2p.WorkerID), func(serviceID string, node p2p.NodeData) {
-			var tunnelAddresses []string
-			for _, v := range p2p.GetAvailableNodes(p2p.NetworkID(networkID, p2p.WorkerID)) {
-				if v.IsOnline() {
-					tunnelAddresses = append(tunnelAddresses, v.TunnelAddress)
-				} else {
-					log.Info().Msgf("Node %s is offline", v.ID)
-				}
-			}
-			tunnelEnvVar := strings.Join(tunnelAddresses, ",")
-
-			os.Setenv("LLAMACPP_GRPC_SERVERS", tunnelEnvVar)
-			log.Debug().Msgf("setting LLAMACPP_GRPC_SERVERS to %s", tunnelEnvVar)
-		}, true); err != nil {
-			return err
-		}
-	}
-
-	return nil
-}
--- a/core/cli/cli.go
+++ b/core/cli/cli.go
@@ -8,13 +8,12 @@ import (
 var CLI struct {
 	cliContext.Context `embed:""`

-	Run             RunCMD             `cmd:"" help:"Run LocalAI, this the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"`
-	Federated       FederatedCLI       `cmd:"" help:"Run LocalAI in federated mode"`
-	Models          ModelsCMD          `cmd:"" help:"Manage LocalAI models and definitions"`
-	TTS             TTSCMD             `cmd:"" help:"Convert text to speech"`
-	SoundGeneration SoundGenerationCMD `cmd:"" help:"Generates audio files from text or audio"`
-	Transcript      TranscriptCMD      `cmd:"" help:"Convert audio to text"`
-	Worker          worker.Worker      `cmd:"" help:"Run workers to distribute workload (llama.cpp-only)"`
-	Util            UtilCMD            `cmd:"" help:"Utility commands"`
-	Explorer        ExplorerCMD        `cmd:"" help:"Run p2p explorer"`
+	Run        RunCMD        `cmd:"" help:"Run LocalAI, this the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"`
+	Federated  FederatedCLI  `cmd:"" help:"Run LocalAI in federated mode"`
+	Models     ModelsCMD     `cmd:"" help:"Manage LocalAI models and definitions"`
+	TTS        TTSCMD        `cmd:"" help:"Convert text to speech"`
+	Transcript TranscriptCMD `cmd:"" help:"Convert audio to text"`
+	Worker     worker.Worker `cmd:"" help:"Run workers to distribute workload (llama.cpp-only)"`
+	Util       UtilCMD       `cmd:"" help:"Utility commands"`
+	Explorer   ExplorerCMD   `cmd:"" help:"Run p2p explorer"`
 }
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -3,10 +3,11 @@ package cli
 import (
 	"context"
 	"fmt"
+	"net"
+	"os"
 	"strings"
 	"time"

-	cli_api "github.com/mudler/LocalAI/core/cli/api"
 	cliContext "github.com/mudler/LocalAI/core/cli/context"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http"
@@ -52,8 +53,6 @@ type RunCMD struct {
 	DisablePredownloadScan bool     `env:"LOCALAI_DISABLE_PREDOWNLOAD_SCAN" help:"If true, disables the best-effort security scanner before downloading any files." group:"hardening" default:"false"`
 	OpaqueErrors           bool     `env:"LOCALAI_OPAQUE_ERRORS" default:"false" help:"If true, all error responses are replaced with blank 500 errors. This is intended only for hardening against information leaks and is normally not recommended." group:"hardening"`
 	Peer2Peer              bool     `env:"LOCALAI_P2P,P2P" name:"p2p" default:"false" help:"Enable P2P mode" group:"p2p"`
-	Peer2PeerDHTInterval   int      `env:"LOCALAI_P2P_DHT_INTERVAL,P2P_DHT_INTERVAL" default:"360" name:"p2p-dht-interval" help:"Interval for DHT refresh (used during token generation)" group:"p2p"`
-	Peer2PeerOTPInterval   int      `env:"LOCALAI_P2P_OTP_INTERVAL,P2P_OTP_INTERVAL" default:"9000" name:"p2p-otp-interval" help:"Interval for OTP refresh (used during token generation)" group:"p2p"`
 	Peer2PeerToken         string   `env:"LOCALAI_P2P_TOKEN,P2P_TOKEN,TOKEN" name:"p2ptoken" help:"Token for P2P mode (optional)" group:"p2p"`
 	Peer2PeerNetworkID     string   `env:"LOCALAI_P2P_NETWORK_ID,P2P_NETWORK_ID" help:"Network ID for P2P mode, can be set arbitrarly by the user for grouping a set of instances" group:"p2p"`
 	ParallelRequests       bool     `env:"LOCALAI_PARALLEL_REQUESTS,PARALLEL_REQUESTS" help:"Enable backends to handle multiple requests in parallel if they support it (e.g.: llama.cpp or vllm)" group:"backends"`
@@ -108,7 +107,7 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 			// IF no token is provided, and p2p is enabled,
 			// we generate one and wait for the user to pick up the token (this is for interactive)
 			log.Info().Msg("No token provided, generating one")
-			token = p2p.GenerateToken(r.Peer2PeerDHTInterval, r.Peer2PeerOTPInterval)
+			token = p2p.GenerateToken()
 			log.Info().Msg("Generated Token:")
 			fmt.Println(token)

@@ -116,12 +115,52 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 			fmt.Printf("export TOKEN=\"%s\"\nlocal-ai worker p2p-llama-cpp-rpc\n", token)
 		}
 		opts = append(opts, config.WithP2PToken(token))
+
+		node, err := p2p.NewNode(token)
+		if err != nil {
+			return err
+		}
+		nodeContext := context.Background()
+
+		err = node.Start(nodeContext)
+		if err != nil {
+			return fmt.Errorf("starting new node: %w", err)
+		}
+
+		log.Info().Msg("Starting P2P server discovery...")
+		if err := p2p.ServiceDiscoverer(nodeContext, node, token, p2p.NetworkID(r.Peer2PeerNetworkID, p2p.WorkerID), func(serviceID string, node p2p.NodeData) {
+			var tunnelAddresses []string
+			for _, v := range p2p.GetAvailableNodes(p2p.NetworkID(r.Peer2PeerNetworkID, p2p.WorkerID)) {
+				if v.IsOnline() {
+					tunnelAddresses = append(tunnelAddresses, v.TunnelAddress)
+				} else {
+					log.Info().Msgf("Node %s is offline", v.ID)
+				}
+			}
+			tunnelEnvVar := strings.Join(tunnelAddresses, ",")
+
+			os.Setenv("LLAMACPP_GRPC_SERVERS", tunnelEnvVar)
+			log.Debug().Msgf("setting LLAMACPP_GRPC_SERVERS to %s", tunnelEnvVar)
+		}, true); err != nil {
+			return err
+		}
 	}

-	backgroundCtx := context.Background()
+	if r.Federated {
+		_, port, err := net.SplitHostPort(r.Address)
+		if err != nil {
+			return err
+		}
+		fedCtx := context.Background()

-	if err := cli_api.StartP2PStack(backgroundCtx, r.Address, token, r.Peer2PeerNetworkID, r.Federated); err != nil {
-		return err
+		node, err := p2p.ExposeService(fedCtx, "localhost", port, token, p2p.NetworkID(r.Peer2PeerNetworkID, p2p.FederatedID))
+		if err != nil {
+			return err
+		}
+
+		if err := p2p.ServiceDiscoverer(fedCtx, node, token, p2p.NetworkID(r.Peer2PeerNetworkID, p2p.FederatedID), nil, false); err != nil {
+			return err
+		}
 	}

 	idleWatchDog := r.EnableWatchdogIdle
--- a/core/cli/soundgeneration.go
+++ b/core/cli/soundgeneration.go
@@ -1,110 +0,0 @@
-package cli
-
-import (
-	"context"
-	"fmt"
-	"os"
-	"path/filepath"
-	"strconv"
-	"strings"
-
-	"github.com/mudler/LocalAI/core/backend"
-	cliContext "github.com/mudler/LocalAI/core/cli/context"
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/pkg/model"
-	"github.com/rs/zerolog/log"
-)
-
-type SoundGenerationCMD struct {
-	Text []string `arg:""`
-
-	Backend                string   `short:"b" required:"" help:"Backend to run the SoundGeneration model"`
-	Model                  string   `short:"m" required:"" help:"Model name to run the SoundGeneration"`
-	Duration               string   `short:"d" help:"If specified, the length of audio to generate in seconds"`
-	Temperature            string   `short:"t" help:"If specified, the temperature of the generation"`
-	InputFile              string   `short:"i" help:"If specified, the input file to condition generation upon"`
-	InputFileSampleDivisor string   `short:"f" help:"If InputFile and this divisor is specified, the first portion of the sample file will be used"`
-	DoSample               bool     `short:"s" default:"true" help:"Enables sampling from the model. Better quality at the cost of speed. Defaults to enabled."`
-	OutputFile             string   `short:"o" type:"path" help:"The path to write the output wav file"`
-	ModelsPath             string   `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
-	BackendAssetsPath      string   `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
-	ExternalGRPCBackends   []string `env:"LOCALAI_EXTERNAL_GRPC_BACKENDS,EXTERNAL_GRPC_BACKENDS" help:"A list of external grpc backends" group:"backends"`
-}
-
-func parseToFloat32Ptr(input string) *float32 {
-	f, err := strconv.ParseFloat(input, 32)
-	if err != nil {
-		return nil
-	}
-	f2 := float32(f)
-	return &f2
-}
-
-func parseToInt32Ptr(input string) *int32 {
-	i, err := strconv.ParseInt(input, 10, 32)
-	if err != nil {
-		return nil
-	}
-	i2 := int32(i)
-	return &i2
-}
-
-func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {
-	outputFile := t.OutputFile
-	outputDir := t.BackendAssetsPath
-	if outputFile != "" {
-		outputDir = filepath.Dir(outputFile)
-	}
-
-	text := strings.Join(t.Text, " ")
-
-	externalBackends := make(map[string]string)
-	// split ":" to get backend name and the uri
-	for _, v := range t.ExternalGRPCBackends {
-		backend := v[:strings.IndexByte(v, ':')]
-		uri := v[strings.IndexByte(v, ':')+1:]
-		externalBackends[backend] = uri
-		fmt.Printf("TMP externalBackends[%q]=%q\n\n", backend, uri)
-	}
-
-	opts := &config.ApplicationConfig{
-		ModelPath:            t.ModelsPath,
-		Context:              context.Background(),
-		AudioDir:             outputDir,
-		AssetsDestination:    t.BackendAssetsPath,
-		ExternalGRPCBackends: externalBackends,
-	}
-	ml := model.NewModelLoader(opts.ModelPath)
-
-	defer func() {
-		err := ml.StopAllGRPC()
-		if err != nil {
-			log.Error().Err(err).Msg("unable to stop all grpc processes")
-		}
-	}()
-
-	options := config.BackendConfig{}
-	options.SetDefaults()
-
-	var inputFile *string
-	if t.InputFile != "" {
-		inputFile = &t.InputFile
-	}
-
-	filePath, _, err := backend.SoundGeneration(t.Backend, t.Model, text,
-		parseToFloat32Ptr(t.Duration), parseToFloat32Ptr(t.Temperature), &t.DoSample,
-		inputFile, parseToInt32Ptr(t.InputFileSampleDivisor), ml, opts, options)
-
-	if err != nil {
-		return err
-	}
-	if outputFile != "" {
-		if err := os.Rename(filePath, outputFile); err != nil {
-			return err
-		}
-		fmt.Printf("Generate file %s\n", outputFile)
-	} else {
-		fmt.Printf("Generate file %s\n", filePath)
-	}
-	return nil
-}
--- a/core/cli/worker/worker.go
+++ b/core/cli/worker/worker.go
@@ -2,7 +2,6 @@ package worker

 type WorkerFlags struct {
 	BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
-	ExtraLLamaCPPArgs string `name:"llama-cpp-args" env:"LOCALAI_EXTRA_LLAMA_CPP_ARGS,EXTRA_LLAMA_CPP_ARGS" help:"Extra arguments to pass to llama-cpp-rpc-server"`
 }

 type Worker struct {
--- a/core/cli/worker/worker_llamacpp.go
+++ b/core/cli/worker/worker_llamacpp.go
@@ -3,7 +3,6 @@ package worker
 import (
 	"fmt"
 	"os"
-	"strings"
 	"syscall"

 	cliContext "github.com/mudler/LocalAI/core/cli/context"
@@ -13,6 +12,7 @@ import (
 )

 type LLamaCPP struct {
+	Args        []string `arg:"" optional:"" name:"models" help:"Model configuration URLs to load"`
 	WorkerFlags `embed:""`
 }

@@ -34,8 +34,9 @@ func (r *LLamaCPP) Run(ctx *cliContext.Context) error {
 		"llama-cpp-rpc-server",
 	)

-	args := strings.Split(r.ExtraLLamaCPPArgs, " ")
+	args := os.Args[4:]
 	args, grpcProcess = library.LoadLDSO(r.BackendAssetsPath, args, grpcProcess)
+
 	args = append([]string{grpcProcess}, args...)
 	return syscall.Exec(
 		grpcProcess,
--- a/core/cli/worker/worker_p2p.go
+++ b/core/cli/worker/worker_p2p.go
@@ -8,7 +8,6 @@ import (
 	"fmt"
 	"os"
 	"os/exec"
-	"strings"
 	"time"

 	cliContext "github.com/mudler/LocalAI/core/cli/context"
@@ -21,11 +20,12 @@ import (

 type P2P struct {
 	WorkerFlags        `embed:""`
-	Token              string `env:"LOCALAI_TOKEN,LOCALAI_P2P_TOKEN,TOKEN" help:"P2P token to use"`
-	NoRunner           bool   `env:"LOCALAI_NO_RUNNER,NO_RUNNER" help:"Do not start the llama-cpp-rpc-server"`
-	RunnerAddress      string `env:"LOCALAI_RUNNER_ADDRESS,RUNNER_ADDRESS" help:"Address of the llama-cpp-rpc-server"`
-	RunnerPort         string `env:"LOCALAI_RUNNER_PORT,RUNNER_PORT" help:"Port of the llama-cpp-rpc-server"`
-	Peer2PeerNetworkID string `env:"LOCALAI_P2P_NETWORK_ID,P2P_NETWORK_ID" help:"Network ID for P2P mode, can be set arbitrarly by the user for grouping a set of instances" group:"p2p"`
+	Token              string   `env:"LOCALAI_TOKEN,LOCALAI_P2P_TOKEN,TOKEN" help:"P2P token to use"`
+	NoRunner           bool     `env:"LOCALAI_NO_RUNNER,NO_RUNNER" help:"Do not start the llama-cpp-rpc-server"`
+	RunnerAddress      string   `env:"LOCALAI_RUNNER_ADDRESS,RUNNER_ADDRESS" help:"Address of the llama-cpp-rpc-server"`
+	RunnerPort         string   `env:"LOCALAI_RUNNER_PORT,RUNNER_PORT" help:"Port of the llama-cpp-rpc-server"`
+	ExtraLLamaCPPArgs  []string `env:"LOCALAI_EXTRA_LLAMA_CPP_ARGS,EXTRA_LLAMA_CPP_ARGS" help:"Extra arguments to pass to llama-cpp-rpc-server"`
+	Peer2PeerNetworkID string   `env:"LOCALAI_P2P_NETWORK_ID,P2P_NETWORK_ID" help:"Network ID for P2P mode, can be set arbitrarly by the user for grouping a set of instances" group:"p2p"`
 }

 func (r *P2P) Run(ctx *cliContext.Context) error {
@@ -76,8 +76,8 @@ func (r *P2P) Run(ctx *cliContext.Context) error {
 					"util",
 					"llama-cpp-rpc-server",
 				)
-				extraArgs := strings.Split(r.ExtraLLamaCPPArgs, " ")
-				args := append([]string{"--host", address, "--port", fmt.Sprint(port)}, extraArgs...)
+
+				args := append([]string{"--host", address, "--port", fmt.Sprint(port)}, r.ExtraLLamaCPPArgs...)
 				args, grpcProcess = library.LoadLDSO(r.BackendAssetsPath, args, grpcProcess)

 				cmd := exec.Command(
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -126,7 +126,6 @@ type LLMConfig struct {
 	Grammar         string   `yaml:"grammar"`
 	StopWords       []string `yaml:"stopwords"`
 	Cutstrings      []string `yaml:"cutstrings"`
-	ExtractRegex    []string `yaml:"extract_regex"`
 	TrimSpace       []string `yaml:"trimspace"`
 	TrimSuffix      []string `yaml:"trimsuffix"`

--- a/core/http/app_test.go
+++ b/core/http/app_test.go
@@ -772,17 +772,6 @@ var _ = Describe("API test", func() {
 			Expect(err.Error()).To(ContainSubstring("error, status code: 500, message: could not load model - all backends returned error:"))
 		})

-		It("shows the external backend", func() {
-			// do an http request to the /system endpoint
-			resp, err := http.Get("http://127.0.0.1:9090/system")
-			Expect(err).ToNot(HaveOccurred())
-			Expect(resp.StatusCode).To(Equal(200))
-			dat, err := io.ReadAll(resp.Body)
-			Expect(err).ToNot(HaveOccurred())
-			Expect(string(dat)).To(ContainSubstring("huggingface"))
-			Expect(string(dat)).To(ContainSubstring("llama-cpp"))
-		})
-
 		It("transcribes audio", func() {
 			if runtime.GOOS != "linux" {
 				Skip("test supported only on linux")
--- a/core/http/endpoints/elevenlabs/soundgeneration.go
+++ b/core/http/endpoints/elevenlabs/soundgeneration.go
@@ -1,65 +0,0 @@
-package elevenlabs
-
-import (
-	"github.com/gofiber/fiber/v2"
-	"github.com/mudler/LocalAI/core/backend"
-	"github.com/mudler/LocalAI/core/config"
-	fiberContext "github.com/mudler/LocalAI/core/http/ctx"
-	"github.com/mudler/LocalAI/core/schema"
-	"github.com/mudler/LocalAI/pkg/model"
-	"github.com/rs/zerolog/log"
-)
-
-// SoundGenerationEndpoint is the ElevenLabs SoundGeneration endpoint https://elevenlabs.io/docs/api-reference/sound-generation
-// @Summary Generates audio from the input text.
-// @Param request body schema.ElevenLabsSoundGenerationRequest true "query params"
-// @Success 200 {string} binary	 "Response"
-// @Router /v1/sound-generation [post]
-func SoundGenerationEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		input := new(schema.ElevenLabsSoundGenerationRequest)
-		// Get input data from the request body
-		if err := c.BodyParser(input); err != nil {
-			return err
-		}
-
-		modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.ModelID, false)
-		if err != nil {
-			modelFile = input.ModelID
-			log.Warn().Str("ModelID", input.ModelID).Msg("Model not found in context")
-		}
-
-		cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
-			config.LoadOptionDebug(appConfig.Debug),
-			config.LoadOptionThreads(appConfig.Threads),
-			config.LoadOptionContextSize(appConfig.ContextSize),
-			config.LoadOptionF16(appConfig.F16),
-		)
-		if err != nil {
-			modelFile = input.ModelID
-			log.Warn().Str("Request ModelID", input.ModelID).Err(err).Msg("error during LoadBackendConfigFileByName, using request ModelID")
-		} else {
-			if input.ModelID != "" {
-				modelFile = input.ModelID
-			} else {
-				modelFile = cfg.Model
-			}
-		}
-		log.Debug().Str("modelFile", "modelFile").Str("backend", cfg.Backend).Msg("Sound Generation Request about to be sent to backend")
-
-		if input.Duration != nil {
-			log.Debug().Float32("duration", *input.Duration).Msg("duration set")
-		}
-		if input.Temperature != nil {
-			log.Debug().Float32("temperature", *input.Temperature).Msg("temperature set")
-		}
-
-		// TODO: Support uploading files?
-		filePath, _, err := backend.SoundGeneration(cfg.Backend, modelFile, input.Text, input.Duration, input.Temperature, input.DoSample, nil, nil, ml, appConfig, *cfg)
-		if err != nil {
-			return err
-		}
-		return c.Download(filePath)
-
-	}
-}
--- a/core/http/endpoints/localai/system.go
+++ b/core/http/endpoints/localai/system.go
@@ -1,29 +0,0 @@
-package localai
-
-import (
-	"github.com/gofiber/fiber/v2"
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/schema"
-	"github.com/mudler/LocalAI/pkg/model"
-)
-
-// SystemInformations returns the system informations
-// @Summary Show the LocalAI instance information
-// @Success 200 {object} schema.SystemInformationResponse "Response"
-// @Router /system [get]
-func SystemInformations(ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(*fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		availableBackends, err := ml.ListAvailableBackends(appConfig.AssetsDestination)
-		if err != nil {
-			return err
-		}
-		for b := range appConfig.ExternalGRPCBackends {
-			availableBackends = append(availableBackends, b)
-		}
-		return c.JSON(
-			schema.SystemInformationResponse{
-				Backends: availableBackends,
-			},
-		)
-	}
-}
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -25,8 +25,9 @@ import (
 // @Success 200 {object} schema.OpenAIResponse "Response"
 // @Router /v1/chat/completions [post]
 func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startupOptions *config.ApplicationConfig) func(c *fiber.Ctx) error {
-	var id, textContentToReturn string
-	var created int
+	textContentToReturn := ""
+	id := uuid.New().String()
+	created := int(time.Now().Unix())

 	process := func(s string, req *schema.OpenAIRequest, config *config.BackendConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
 		initialMessage := schema.OpenAIResponse{
@@ -68,9 +69,9 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup

 		textContentToReturn = functions.ParseTextContent(result, config.FunctionsConfig)
 		result = functions.CleanupLLMResult(result, config.FunctionsConfig)
-		functionResults := functions.ParseFunctionCall(result, config.FunctionsConfig)
+		results := functions.ParseFunctionCall(result, config.FunctionsConfig)
 		log.Debug().Msgf("Text content to return: %s", textContentToReturn)
-		noActionToRun := len(functionResults) > 0 && functionResults[0].Name == noAction || len(functionResults) == 0
+		noActionToRun := len(results) > 0 && results[0].Name == noAction || len(results) == 0

 		switch {
 		case noActionToRun:
@@ -83,7 +84,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 			}
 			responses <- initialMessage

-			result, err := handleQuestion(config, req, ml, startupOptions, functionResults, result, prompt)
+			result, err := handleQuestion(config, req, ml, startupOptions, results, result, prompt)
 			if err != nil {
 				log.Error().Err(err).Msg("error handling question")
 				return
@@ -105,7 +106,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 			responses <- resp

 		default:
-			for i, ss := range functionResults {
+			for i, ss := range results {
 				name, args := ss.Name, ss.Arguments

 				initialMessage := schema.OpenAIResponse{
@@ -158,10 +159,6 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 	}

 	return func(c *fiber.Ctx) error {
-		textContentToReturn = ""
-		id = uuid.New().String()
-		created = int(time.Now().Unix())
-
 		modelFile, input, err := readRequest(c, cl, ml, startupOptions, true)
 		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
--- a/core/http/routes/elevenlabs.go
+++ b/core/http/routes/elevenlabs.go
@@ -16,6 +16,4 @@ func RegisterElevenLabsRoutes(app *fiber.App,
 	// Elevenlabs
 	app.Post("/v1/text-to-speech/:voice-id", auth, elevenlabs.TTSEndpoint(cl, ml, appConfig))

-	app.Post("/v1/sound-generation", auth, elevenlabs.SoundGenerationEndpoint(cl, ml, appConfig))
-
 }
--- a/core/http/routes/localai.go
+++ b/core/http/routes/localai.go
@@ -70,6 +70,4 @@ func RegisterLocalAIRoutes(app *fiber.App,
 		}{Version: internal.PrintableVersion()})
 	})

-	app.Get("/system", auth, localai.SystemInformations(ml, appConfig))
-
 }
--- a/core/http/static/assets/KFOlCnqEu92Fr1MmEU9fBBc9.ttf
+++ b/core/http/static/assets/KFOlCnqEu92Fr1MmEU9fBBc9.ttf
--- a/core/http/static/assets/KFOlCnqEu92Fr1MmEU9vAw.ttf
+++ b/core/http/static/assets/KFOlCnqEu92Fr1MmEU9vAw.ttf
--- a/core/http/static/assets/KFOlCnqEu92Fr1MmSU5fBBc9.ttf
+++ b/core/http/static/assets/KFOlCnqEu92Fr1MmSU5fBBc9.ttf
--- a/core/http/static/assets/KFOlCnqEu92Fr1MmWUlfBBc9.ttf
+++ b/core/http/static/assets/KFOlCnqEu92Fr1MmWUlfBBc9.ttf
--- a/core/http/static/assets/KFOlCnqEu92Fr1MmYUtfBBc9.ttf
+++ b/core/http/static/assets/KFOlCnqEu92Fr1MmYUtfBBc9.ttf
--- a/core/http/static/assets/KFOmCnqEu92Fr1Me5Q.ttf
+++ b/core/http/static/assets/KFOmCnqEu92Fr1Me5Q.ttf
--- a/core/http/static/assets/KFOmCnqEu92Fr1Mu4mxP.ttf
+++ b/core/http/static/assets/KFOmCnqEu92Fr1Mu4mxP.ttf
--- a/core/http/static/assets/UcCO3FwrK3iLTeHuS_fvQtMwCp50KnMw2boKoduKmMEVuFuYMZg.ttf
+++ b/core/http/static/assets/UcCO3FwrK3iLTeHuS_fvQtMwCp50KnMw2boKoduKmMEVuFuYMZg.ttf
--- a/core/http/static/assets/UcCO3FwrK3iLTeHuS_fvQtMwCp50KnMw2boKoduKmMEVuGKYMZg.ttf
+++ b/core/http/static/assets/UcCO3FwrK3iLTeHuS_fvQtMwCp50KnMw2boKoduKmMEVuGKYMZg.ttf
--- a/core/http/static/assets/UcCO3FwrK3iLTeHuS_fvQtMwCp50KnMw2boKoduKmMEVuLyfMZg.ttf
+++ b/core/http/static/assets/UcCO3FwrK3iLTeHuS_fvQtMwCp50KnMw2boKoduKmMEVuLyfMZg.ttf
--- a/core/http/static/assets/alpine.js
+++ b/core/http/static/assets/alpine.js
--- a/core/http/static/assets/highlightjs.css
+++ b/core/http/static/assets/highlightjs.css
@@ -1,9 +0,0 @@
-/*!
-  Theme: Default
-  Description: Original highlight.js style
-  Author: (c) Ivan Sagalaev <maniac@softwaremaniacs.org>
-  Maintainer: @highlightjs/core-team
-  Website: https://highlightjs.org/
-  License: see project LICENSE
-  Touched: 2021
-*/pre code.hljs{display:block;overflow-x:auto;padding:1em}code.hljs{padding:3px 5px}.hljs{background:#f3f3f3;color:#444}.hljs-comment{color:#697070}.hljs-punctuation,.hljs-tag{color:#444a}.hljs-tag .hljs-attr,.hljs-tag .hljs-name{color:#444}.hljs-attribute,.hljs-doctag,.hljs-keyword,.hljs-meta .hljs-keyword,.hljs-name,.hljs-selector-tag{font-weight:700}.hljs-deletion,.hljs-number,.hljs-quote,.hljs-selector-class,.hljs-selector-id,.hljs-string,.hljs-template-tag,.hljs-type{color:#800}.hljs-section,.hljs-title{color:#800;font-weight:700}.hljs-link,.hljs-operator,.hljs-regexp,.hljs-selector-attr,.hljs-selector-pseudo,.hljs-symbol,.hljs-template-variable,.hljs-variable{color:#ab5656}.hljs-literal{color:#695}.hljs-addition,.hljs-built_in,.hljs-bullet,.hljs-code{color:#397300}.hljs-meta{color:#1f7199}.hljs-meta .hljs-string{color:#38a}.hljs-emphasis{font-style:italic}.hljs-strong{font-weight:700}
--- a/core/http/static/assets/highlightjs.js
+++ b/core/http/static/assets/highlightjs.js
--- a/core/http/static/assets/htmx.js
+++ b/core/http/static/assets/htmx.js
--- a/core/http/static/assets/marked.js
+++ b/core/http/static/assets/marked.js
--- a/core/http/static/assets/purify.js
+++ b/core/http/static/assets/purify.js
--- a/core/http/static/assets/tailwindcss.js
+++ b/core/http/static/assets/tailwindcss.js
--- a/core/http/static/assets/tw-elements.css
+++ b/core/http/static/assets/tw-elements.css
--- a/core/http/static/assets/tw-elements.js
+++ b/core/http/static/assets/tw-elements.js
--- a/core/p2p/p2p.go
+++ b/core/p2p/p2p.go
@@ -28,15 +28,9 @@ import (
 	"github.com/mudler/edgevpn/pkg/logger"
 )

-func generateNewConnectionData(DHTInterval, OTPInterval int) *node.YAMLConnectionConfig {
+func generateNewConnectionData() *node.YAMLConnectionConfig {
 	maxMessSize := 20 << 20 // 20MB
 	keyLength := 43
-	if DHTInterval == 0 {
-		DHTInterval = 360
-	}
-	if OTPInterval == 0 {
-		OTPInterval = 9000
-	}

 	return &node.YAMLConnectionConfig{
 		MaxMessageSize: maxMessSize,
@@ -46,21 +40,21 @@ func generateNewConnectionData(DHTInterval, OTPInterval int) *node.YAMLConnectio
 		OTP: node.OTP{
 			DHT: node.OTPConfig{
 				Key:      eutils.RandStringRunes(keyLength),
-				Interval: DHTInterval,
+				Interval: 120,
 				Length:   keyLength,
 			},
 			Crypto: node.OTPConfig{
 				Key:      eutils.RandStringRunes(keyLength),
-				Interval: OTPInterval,
+				Interval: 9000,
 				Length:   keyLength,
 			},
 		},
 	}
 }

-func GenerateToken(DHTInterval, OTPInterval int) string {
+func GenerateToken() string {
 	// Generates a new config and exit
-	return generateNewConnectionData(DHTInterval, OTPInterval).Base64()
+	return generateNewConnectionData().Base64()
 }

 func IsP2PEnabled() bool {
--- a/core/p2p/p2p_disabled.go
+++ b/core/p2p/p2p_disabled.go
@@ -10,7 +10,7 @@ import (
 	"github.com/mudler/edgevpn/pkg/node"
 )

-func GenerateToken(DHTInterval, OTPInterval int) string {
+func GenerateToken() string {
 	return "not implemented"
 }

--- a/core/schema/elevenlabs.go
+++ b/core/schema/elevenlabs.go
@@ -4,11 +4,3 @@ type ElevenLabsTTSRequest struct {
 	Text    string `json:"text" yaml:"text"`
 	ModelID string `json:"model_id" yaml:"model_id"`
 }
-
-type ElevenLabsSoundGenerationRequest struct {
-	Text        string   `json:"text" yaml:"text"`
-	ModelID     string   `json:"model_id" yaml:"model_id"`
-	Duration    *float32 `json:"duration_seconds,omitempty" yaml:"duration_seconds,omitempty"`
-	Temperature *float32 `json:"prompt_influence,omitempty" yaml:"prompt_influence,omitempty"`
-	DoSample    *bool    `json:"do_sample,omitempty" yaml:"do_sample,omitempty"`
-}
--- a/core/schema/localai.go
+++ b/core/schema/localai.go
@@ -70,7 +70,3 @@ type P2PNodesResponse struct {
 	Nodes          []p2p.NodeData `json:"nodes" yaml:"nodes"`
 	FederatedNodes []p2p.NodeData `json:"federated_nodes" yaml:"federated_nodes"`
 }
-
-type SystemInformationResponse struct {
-	Backends []string `json:"backends"`
-}
--- a/core/services/backend_monitor.go
+++ b/core/services/backend_monitor.go
@@ -107,7 +107,7 @@ func (bms BackendMonitorService) CheckAndSample(modelName string) (*proto.Status
 		return nil, err
 	}
 	modelAddr := bms.modelLoader.CheckIsLoaded(backendId)
-	if modelAddr == nil {
+	if modelAddr == "" {
 		return nil, fmt.Errorf("backend %s is not currently loaded", backendId)
 	}

--- a/docs/content/docs/features/GPU-acceleration.md
+++ b/docs/content/docs/features/GPU-acceleration.md
@@ -133,10 +133,6 @@ Due to the nature of ROCm it is best to run all implementations in containers as
 Ongoing verification testing of ROCm compatibility with integrated backends.
 Please note the following list of verified backends and devices.

-LocalAI hipblas images are built against the following targets: gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
-
-If your device is not one of these you must specify the corresponding `GPU_TARGETS` and specify `REBUILD=true`. Otherwise you don't need to specify these in the commands below.
-
 ### Verified 

 The devices in the following list have been tested with `hipblas` images running `ROCm 6.0.0`
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Ettore Di Giacinto	2a03905920	Merge branch 'master' into cleanup_deps	2024-08-21 13:10:46 +02:00
Ettore Di Giacinto	35297ebc14	Drop also ttf files Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2024-08-21 13:03:26 +02:00
Ettore Di Giacinto	b303805df9	fix marked	2024-08-21 13:02:19 +02:00
Ettore Di Giacinto	32d51797d9	fix alpine.js	2024-08-21 13:02:19 +02:00
Ettore Di Giacinto	af09b019ed	fix(assets): generate assets on build time Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2024-08-21 13:02:19 +02:00