Merge branch 'master' into cleanup_deps

Drop also ttf files
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-05-24 16:51:44 -04:00 · 2024-08-21 13:10:46 +02:00 · 2024-08-21 13:03:26 +02:00 · 2024-08-21 13:02:19 +02:00 · 2024-08-21 13:02:19 +02:00 · 2024-08-21 13:02:19 +02:00
136 changed files with 1228 additions and 3701 deletions
--- a/.devcontainer-scripts/utils.sh
+++ b/.devcontainer-scripts/utils.sh
@@ -32,22 +32,18 @@ config_remote() {
 }
 # Setup special .ssh files
 #
 # Creates ~/.ssh with mode 0700 if needed, then copies each named file from
 # /devcontainer-customization into it with mode 600. Files that already exist
 # in ~/.ssh are left untouched. Finishes by listing the directory contents.
 #
 # Param 1: bash array, filenames relative to the customization directory that should be copied to ~/.ssh
 setup_ssh() {
    echo "starting ~/.ssh directory setup..."
    # The "/" matters: "${HOME}.ssh" would create a sibling of the home directory.
    mkdir -p "${HOME}/.ssh"
    chmod 0700 "${HOME}/.ssh"
    echo "-----"
    local files=("$@")
    local file cfile hfile
    for file in "${files[@]}"; do
        cfile="/devcontainer-customization/${file}"
        # Use ${HOME} rather than "~": tilde is not expanded inside double quotes.
        hfile="${HOME}/.ssh/${file}"
        if [ ! -f "${hfile}" ]; then
            echo "copying ${file}"
            cp "${cfile}" "${hfile}"
            chmod 600 "${hfile}"
        fi
    done
    ls "${HOME}/.ssh"
 }
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -56,7 +56,7 @@ jobs:
          rm -rfv ${{ matrix.variable }}_message.txt
          rm -rfv ${{ matrix.variable }}_commit.txt
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v7
+        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
--- a/.github/workflows/bump_docs.yaml
+++ b/.github/workflows/bump_docs.yaml
@@ -17,7 +17,7 @@ jobs:
        run: |
          bash .github/bump_docs.sh ${{ matrix.repository }}
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v7
+        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
--- a/.github/workflows/checksum_checker.yaml
+++ b/.github/workflows/checksum_checker.yaml
@@ -36,7 +36,7 @@ jobs:
          sudo chmod 777 /hf_cache
          bash .github/checksum_checker.sh gallery/index.yaml
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v7
+        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -294,7 +294,7 @@ jobs:
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
          export PATH=$PATH:$GOPATH/bin
-          export SKIP_GRPC_BACKEND=backend-assets/grpc/whisper
+
          make dist
      - uses: actions/upload-artifact@v4
        with:
@@ -327,7 +327,7 @@ jobs:
          cache: false
      - name: Dependencies
        run: |
-          brew install protobuf grpc libomp llvm
+          brew install protobuf grpc
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
      - name: Build
@@ -336,7 +336,7 @@ jobs:
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
          export PATH=$PATH:$GOPATH/bin
-          export CC=/opt/homebrew/opt/llvm/bin/clang
+
          make dist
      - uses: actions/upload-artifact@v4
        with:
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -18,7 +18,7 @@ jobs:
        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
        if: ${{ github.actor != 'dependabot[bot]' }}
-        uses: securego/gosec@v2.21.0
+        uses: securego/gosec@master
        with:
          # we let the report content trigger a failure using the GitHub Security features.
          args: '-no-fail -fmt sarif -out results.sarif ./...'
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -214,13 +214,12 @@ jobs:
        run: go version
      - name: Dependencies
        run: |
-          brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
+          brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc
          pip install --user --no-cache-dir grpcio-tools==1.64.1
      - name: Test
        run: |
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
          export CC=/opt/homebrew/opt/llvm/bin/clang
          # Used to run the newer GNUMake version from brew that supports --output-sync
          export PATH="/opt/homebrew/opt/make/libexec/gnubin:$PATH"
          BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
--- a/.github/workflows/update_swagger.yaml
+++ b/.github/workflows/update_swagger.yaml
@@ -25,7 +25,7 @@ jobs:
        run: |
          make protogen-go swagger
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v7
+        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
--- a/39
+++ b/39
@@ -13,7 +13,7 @@ ARG TARGETARCH
 ARG TARGETVARIANT
 ENV DEBIAN_FRONTEND=noninteractive
-ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
+ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
 RUN apt-get update && \
@@ -263,20 +263,14 @@ EOT
 # In most cases, builder is the image you should be using - however, this can save build time if one just needs to copy backend-assets/grpc/stablediffusion and nothing else.
 FROM builder-base AS builder-sd
-# stablediffusion does not tolerate a newer version of abseil, copy only over enough elements to build it
+COPY . .
-COPY Makefile .
+COPY .git .
 COPY go.mod .
 COPY go.sum .
 COPY backend/backend.proto ./backend/backend.proto
 COPY backend/go/image/stablediffusion ./backend/go/image/stablediffusion
 COPY pkg/grpc ./pkg/grpc
 COPY pkg/stablediffusion ./pkg/stablediffusion
 RUN git init
 RUN make sources/go-stable-diffusion
 RUN touch prepare-sources
-# Actually build the backend
+RUN make prepare
-RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make backend-assets/grpc/stablediffusion
+
 # stablediffusion does not tolerate a newer version of abseil, build it first
 RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
 ###################################
 ###################################
@@ -291,20 +285,8 @@ COPY --from=grpc /opt/grpc /usr/local
 # Rebuild with defaults backends
 WORKDIR /build
 COPY . .
 COPY .git .
 RUN make prepare
 ## Build the binary
-## If it's CUDA, we want to skip some of the llama-compat backends to save space
+RUN make build
 ## We only leave the most CPU-optimized variant and the fallback for the cublas build
 ## (both will use CUDA for the actual computation)
 RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
        SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
    else \
        make build; \
    fi
 RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
        mkdir -p /build/sources/go-piper/piper-phonemize/pi/lib/ \
@@ -418,6 +400,9 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAG
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/transformers-musicgen \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "exllama1" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/exllama \
    ; fi
 RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
--- a/21
+++ b/21
@@ -8,7 +8,7 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=e6b7801bd189d102d901d3e72035611a25456ef1
+CPPLLAMA_VERSION?=2f3c1466ff46a2413b0e363a5005c46538186ee6
 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
@@ -16,7 +16,7 @@ RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
-WHISPER_CPP_VERSION?=a551933542d956ae84634937acd2942eb40efaaf
+WHISPER_CPP_VERSION?=d65786ea540a5aef21f67cacfa6f134097727780
 # bert.cpp version
 BERT_REPO?=https://github.com/go-skynet/go-bert.cpp
@@ -338,7 +338,7 @@ rebuild: ## Rebuilds the project
 	$(MAKE) -C sources/go-tiny-dream clean
 	$(MAKE) build
-prepare: prepare-sources $(OPTIONAL_TARGETS)
+prepare: prepare-sources gen-assets $(OPTIONAL_TARGETS)
 clean: ## Remove build related file
 	$(GOCMD) clean -cache
@@ -534,10 +534,10 @@ protogen-go-clean:
 	$(RM) bin/*
 .PHONY: protogen-python
-protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen
+protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen
 .PHONY: protogen-python-clean
-protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean  exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean
+protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean
 .PHONY: autogptq-protogen
 autogptq-protogen:
@@ -571,6 +571,14 @@ diffusers-protogen:
 diffusers-protogen-clean:
 	$(MAKE) -C backend/python/diffusers protogen-clean
 .PHONY: exllama-protogen
 exllama-protogen:
 	$(MAKE) -C backend/python/exllama protogen
 .PHONY: exllama-protogen-clean
 exllama-protogen-clean:
 	$(MAKE) -C backend/python/exllama protogen-clean
 .PHONY: exllama2-protogen
 exllama2-protogen:
 	$(MAKE) -C backend/python/exllama2 protogen
@@ -667,6 +675,7 @@ prepare-extra-conda-environments: protogen-python
 	$(MAKE) -C backend/python/parler-tts
 	$(MAKE) -C backend/python/vall-e-x
 	$(MAKE) -C backend/python/openvoice
 	$(MAKE) -C backend/python/exllama
 	$(MAKE) -C backend/python/exllama2
 prepare-test-extra: protogen-python
@@ -837,7 +846,7 @@ endif
 backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/libwhisper.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="$(CURDIR)/sources/whisper.cpp/include:$(CURDIR)/sources/whisper.cpp/ggml/include" LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/whisper
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/
 ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/whisper
 endif
--- a/README.md
+++ b/README.md
@@ -40,7 +40,7 @@
 > :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
 >
-> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/go-skynet/LocalAI/tree/master/examples/) 
+> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/) [ 🚀 Roadmap ](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
 [![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)
@@ -72,7 +72,6 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
 [Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
 - Aug 2024:  🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io)
 - July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723
 - June 2024: 🆕 You can now browse the model gallery without LocalAI! Check out https://models.localai.io
 - June 2024: Support for models from OCI registries: https://github.com/mudler/LocalAI/pull/2628
--- a/aio/intel/image-gen.yaml
+++ b/aio/intel/image-gen.yaml
@@ -1,6 +1,6 @@
 name: stablediffusion
 parameters:
-  model: Lykon/dreamshaper-8
+  model: runwayml/stable-diffusion-v1-5
 backend: diffusers
 step: 25
 f16: true
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -16,7 +16,6 @@ service Backend {
  rpc GenerateImage(GenerateImageRequest) returns (Result) {}
  rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
  rpc TTS(TTSRequest) returns (Result) {}
  rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
  rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
  rpc Status(HealthMessage) returns (StatusResponse) {}
@@ -271,17 +270,6 @@ message TTSRequest {
  optional string language = 5;
 }
 message SoundGenerationRequest {
  string text = 1;
  string model = 2;
  string dst = 3;
  optional float duration = 4;
  optional float temperature = 5;
  optional bool sample = 6;
  optional string src = 7;
  optional int32 src_divisor = 8;
 }
 message TokenizationResponse {
  int32 length = 1;
  repeated int32 tokens = 2;
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -17,10 +17,11 @@
 #include "common.h"
 #include "json.hpp"
 #include "llama.h"
 #include "grammar-parser.h"
 #include "backend.pb.h"
 #include "backend.grpc.pb.h"
 #include "utils.hpp"
-#include "sampling.h"
+
 // include std::regex
 #include <cstddef>
 #include <thread>
@@ -202,8 +203,8 @@ struct llama_client_slot
    std::string stopping_word;
    // sampling
-    struct gpt_sampler_params sparams;
+    struct llama_sampling_params sparams;
-    gpt_sampler *ctx_sampling = nullptr;
+    llama_sampling_context *ctx_sampling = nullptr;
    int32_t ga_i = 0;   // group-attention state
    int32_t ga_n = 1;   // group-attention factor
@@ -618,7 +619,7 @@ struct llama_server_context
    bool launch_slot_with_data(llama_client_slot* &slot, json data) {
        slot_params default_params;
-        gpt_sampler_params default_sparams;
+        llama_sampling_params default_sparams;
        slot->params.stream             = json_value(data, "stream",            false);
        slot->params.cache_prompt       = json_value(data, "cache_prompt",      false);
@@ -627,7 +628,7 @@ struct llama_server_context
        slot->sparams.top_p             = json_value(data, "top_p",             default_sparams.top_p);
        slot->sparams.min_p             = json_value(data, "min_p",             default_sparams.min_p);
        slot->sparams.tfs_z             = json_value(data, "tfs_z",             default_sparams.tfs_z);
-        slot->sparams.typ_p             = json_value(data, "typical_p",         default_sparams.typ_p);
+        slot->sparams.typical_p         = json_value(data, "typical_p",         default_sparams.typical_p);
        slot->sparams.temp              = json_value(data, "temperature",       default_sparams.temp);
        slot->sparams.dynatemp_range    = json_value(data, "dynatemp_range",    default_sparams.dynatemp_range);
        slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
@@ -640,7 +641,7 @@ struct llama_server_context
        slot->sparams.mirostat_eta      = json_value(data, "mirostat_eta",      default_sparams.mirostat_eta);
        slot->sparams.penalize_nl       = json_value(data, "penalize_nl",       default_sparams.penalize_nl);
        slot->params.n_keep             = json_value(data, "n_keep",            slot->params.n_keep);
-        slot->sparams.seed               = json_value(data, "seed",              default_sparams.seed);
+        slot->params.seed               = json_value(data, "seed",              default_params.seed);
        slot->sparams.grammar           = json_value(data, "grammar",           default_sparams.grammar);
        slot->sparams.n_probs           = json_value(data, "n_probs",           default_sparams.n_probs);
        slot->sparams.min_keep          = json_value(data, "min_keep",          default_sparams.min_keep);
@@ -664,7 +665,6 @@ struct llama_server_context
            slot->params.input_prefix = "";
        }
        if (data.count("input_suffix") != 0)
        {
            slot->params.input_suffix = data["input_suffix"];
@@ -683,10 +683,6 @@ struct llama_server_context
            slot->prompt = "";
        }
        if (json_value(data, "ignore_eos", false)) {
                slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
        }
        /*
        slot->sparams.penalty_prompt_tokens.clear();
        slot->sparams.use_penalty_prompt_tokens = false;
        const auto &penalty_prompt = data.find("penalty_prompt");
@@ -722,10 +718,14 @@ struct llama_server_context
                slot->sparams.use_penalty_prompt_tokens = true;
            }
        }
      */
        slot->sparams.logit_bias.clear();
        if (json_value(data, "ignore_eos", false))
        {
            slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
        }
        const auto &logit_bias = data.find("logit_bias");
        if (logit_bias != data.end() && logit_bias->is_array())
        {
@@ -753,7 +753,7 @@ struct llama_server_context
                        llama_token tok = el[0].get<llama_token>();
                        if (tok >= 0 && tok < n_vocab)
                        {
-                            slot->sparams.logit_bias.push_back({tok, bias});
+                            slot->sparams.logit_bias[tok] = bias;
                        }
                    }
                    else if (el[0].is_string())
@@ -761,13 +761,13 @@ struct llama_server_context
                        auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
                        for (auto tok : toks)
                        {
-                            slot->sparams.logit_bias.push_back({tok, bias});
+                            slot->sparams.logit_bias[tok] = bias;
                        }
                    }
                }
            }
        }
-        
+
        slot->params.antiprompt.clear();
        const auto &stop = data.find("stop");
@@ -781,22 +781,24 @@ struct llama_server_context
                }
            }
        }
-        
+
-        const auto & samplers = data.find("samplers");
+        const auto &samplers_sequence = data.find("samplers");
-        if (samplers != data.end() && samplers->is_array()) {
+        if (samplers_sequence != data.end() && samplers_sequence->is_array())
        {
            std::vector<std::string> sampler_names;
-                for (const auto & name : *samplers) {
+            for (const auto &sampler_name : *samplers_sequence)
-                    if (name.is_string()) {
+            {
-                        sampler_names.emplace_back(name);
+                if (sampler_name.is_string())
-                    }
+                {
                    sampler_names.emplace_back(sampler_name);
                }
-                slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
+            }
            slot->sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
        }
        else
        {
-                slot->sparams.samplers = default_sparams.samplers;
+            slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
        }
        if (multimodal)
        {
@@ -873,10 +875,10 @@ struct llama_server_context
        if (slot->ctx_sampling != nullptr)
        {
-            gpt_sampler_free(slot->ctx_sampling);
+            llama_sampling_free(slot->ctx_sampling);
        }
-        slot->ctx_sampling = gpt_sampler_init(model, slot->sparams);
+        slot->ctx_sampling = llama_sampling_init(slot->sparams);
-        //llama_set_rng_seed(ctx, slot->params.seed);
+        llama_set_rng_seed(ctx, slot->params.seed);
        slot->command = LOAD_PROMPT;
        all_slots_are_idle = false;
@@ -886,7 +888,7 @@ struct llama_server_context
            {"task_id", slot->task_id},
        });
-      //  LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
+        LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
        return true;
    }
@@ -1004,13 +1006,11 @@ struct llama_server_context
        slot.generated_text += token_str;
        slot.has_next_token = true;
 /*
        if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
        {
            // we can change penalty_prompt_tokens because it is always created from scratch each request
            slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok);
        }
        */
        // check if there is incomplete UTF-8 character at the end
        bool incomplete = false;
@@ -1119,7 +1119,7 @@ struct llama_server_context
                continue;
            }
-            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
+            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
                LOG_TEE("Error processing the given image");
                return false;
            }
@@ -1144,11 +1144,13 @@ struct llama_server_context
    json get_formated_generation(llama_client_slot &slot)
    {
-        std::vector<std::string> samplers;
+        const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
-        samplers.reserve(slot.sparams.samplers.size());
+        const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
-        for (const auto & sampler : slot.sparams.samplers)
+                                eos_bias->second < 0.0f && std::isinf(eos_bias->second);
        std::vector<std::string> samplers_sequence;
        for (const auto &sampler_type : slot.sparams.samplers_sequence)
        {
-            samplers.emplace_back(gpt_sampler_type_to_str(sampler));
+            samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
        }
        return json {
@@ -1163,11 +1165,13 @@ struct llama_server_context
            {"top_p",             slot.sparams.top_p},
            {"min_p",             slot.sparams.min_p},
            {"tfs_z",             slot.sparams.tfs_z},
-            {"typical_p",         slot.sparams.typ_p},
+            {"typical_p",         slot.sparams.typical_p},
            {"repeat_last_n",     slot.sparams.penalty_last_n},
            {"repeat_penalty",    slot.sparams.penalty_repeat},
            {"presence_penalty",  slot.sparams.penalty_present},
            {"frequency_penalty", slot.sparams.penalty_freq},
            {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens},
            {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens},
            {"mirostat",          slot.sparams.mirostat},
            {"mirostat_tau",      slot.sparams.mirostat_tau},
            {"mirostat_eta",      slot.sparams.mirostat_eta},
@@ -1175,13 +1179,13 @@ struct llama_server_context
            {"stop",              slot.params.antiprompt},
            {"n_predict",         slot.params.n_predict},
            {"n_keep",            params.n_keep},
-            {"ignore_eos",        slot.sparams.ignore_eos},
+            {"ignore_eos",        ignore_eos},
            {"stream",            slot.params.stream},
-      //      {"logit_bias",        slot.sparams.logit_bias},
+            {"logit_bias",        slot.sparams.logit_bias},
            {"n_probs",           slot.sparams.n_probs},
            {"min_keep",          slot.sparams.min_keep},
            {"grammar",           slot.sparams.grammar},
-            {"samplers",          samplers}
+            {"samplers",          samplers_sequence}
        };
    }
@@ -1710,7 +1714,7 @@ struct llama_server_context
                    if (!slot.params.cache_prompt)
                    {
-                        gpt_sampler_reset(slot.ctx_sampling);
+                        llama_sampling_reset(slot.ctx_sampling);
                        slot.n_past = 0;
                        slot.n_past_se = 0;
@@ -1722,7 +1726,7 @@ struct llama_server_context
                        // push the prompt into the sampling context (do not apply grammar)
                        for (auto &token : prompt_tokens)
                        {
-                            gpt_sampler_accept(slot.ctx_sampling, token, false);
+                            llama_sampling_accept(slot.ctx_sampling, ctx, token, false);
                        }
                        slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
@@ -1930,9 +1934,9 @@ struct llama_server_context
                }
                completion_token_output result;
-                const llama_token id = gpt_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
+                const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i);
-                gpt_sampler_accept(slot.ctx_sampling, id, true);
+                llama_sampling_accept(slot.ctx_sampling, ctx, id, true);
                slot.n_decoded += 1;
                if (slot.n_decoded == 1)
@@ -1942,14 +1946,19 @@ struct llama_server_context
                    metrics.on_prompt_eval(slot);
                }
                llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
                result.tok = id;
                const auto * cur_p = gpt_sampler_get_candidates(slot.ctx_sampling);
-                for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
+                const int32_t n_probs = slot.sparams.n_probs;
-                    result.probs.push_back({
+                if (slot.sparams.temp <= 0 && n_probs > 0)
-                        cur_p->data[i].id,
+                {
-                        i >= cur_p->size ? 0.0f : cur_p->data[i].p,
+                    // for llama_sample_token_greedy we need to sort candidates
-                    });
+                    llama_sample_softmax(ctx, &cur_p);
                }
                for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
                {
                    result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
                }
                if (!process_token(result, slot))
@@ -2201,7 +2210,7 @@ static void params_parse(const backend::ModelOptions* request,
    params.model_alias =  request->modelfile();
    params.n_ctx = request->contextsize();
    //params.memory_f16 = request->f16memory();
-    params.cpuparams.n_threads = request->threads();
+    params.n_threads = request->threads();
    params.n_gpu_layers = request->ngpulayers();
    params.n_batch = request->nbatch();
    // Set params.n_parallel by environment variable (LLAMA_PARALLEL), defaults to 1
--- a/backend/cpp/llama/patches/01-llava.patch
+++ b/backend/cpp/llama/patches/01-llava.patch
@@ -1,13 +0,0 @@
 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
 index 342042ff..224db9b5 100644
 --- a/examples/llava/clip.cpp
 +++ b/examples/llava/clip.cpp
@@ -2419,7 +2419,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
             int* patches_data = (int*)malloc(ggml_nbytes(patches));
             for (int i = 0; i < num_patches; i++) {
 -                patches_data[i] = i + 1;
 +                patches_data[i] = i;
             }
             ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
             free(patches_data);
--- a/backend/cpp/llama/prepare.sh
+++ b/backend/cpp/llama/prepare.sh
@@ -1,12 +1,5 @@
 #!/bin/bash
 ## Patches
 ## Apply every patch found in the `patches` directory to the llama.cpp checkout.
 ## A glob is used instead of parsing `ls` so file names with whitespace stay
 ## intact; the -e guard skips the unexpanded pattern when nothing matches.
 for patch in patches/*; do
    [ -e "$patch" ] || continue
    echo "Applying patch ${patch##*/}"
    patch -d llama.cpp/ -p1 < "$patch"
 done
 cp -r CMakeLists.txt llama.cpp/examples/grpc-server/
 cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
 cp -rfv json.hpp llama.cpp/examples/grpc-server/
--- a/backend/cpp/llama/utils.hpp
+++ b/backend/cpp/llama/utils.hpp
@@ -480,4 +480,31 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
    }
    return ret;
 }
 //
 // random string / id
 //
 // Returns a 32-character string whose characters are drawn from the digits
 // and the upper/lower-case ASCII letters, using a std::mt19937 engine seeded
 // from std::random_device on every call.
 static std::string random_string()
 {
    static const std::string alphabet("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
    const size_t length = 32;
    std::random_device seed_source;
    std::mt19937 engine(seed_source());
    std::string out;
    out.reserve(length);
    while (out.size() < length) {
        // One engine draw per character, reduced onto the alphabet.
        out.push_back(alphabet[engine() % alphabet.size()]);
    }
    return out;
 }
 // Builds a chat-completion identifier: the literal prefix "chatcmpl-"
 // followed by the output of random_string().
 static std::string gen_chatcmplid()
 {
    std::string id("chatcmpl-");
    id += random_string();
    return id;
 }
--- a/backend/go/transcribe/whisper/main.go
+++ b/backend/go/transcribe/whisper/main.go
--- a/backend/go/transcribe/transcript.go
+++ b/backend/go/transcribe/transcript.go
@@ -0,0 +1,104 @@
 package main
 import (
 	"fmt"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
 	"github.com/go-audio/wav"
 	"github.com/mudler/LocalAI/core/schema"
 )
 // ffmpegCommand runs the ffmpeg executable with args, passing along the
 // current process environment. It returns the combined stdout/stderr output
 // as a string together with the error reported by the command, if any.
 func ffmpegCommand(args []string) (string, error) {
 	// Constrain this to ffmpeg to permit security scanner to see that the command is safe.
 	ffmpeg := exec.Command("ffmpeg", args...)
 	ffmpeg.Env = os.Environ()
 	combined, runErr := ffmpeg.CombinedOutput()
 	return string(combined), runErr
 }
 // audioToWav converts the audio file at src into a WAV file at dst for
 // transcription by invoking ffmpeg with a 16000 sample rate, one channel and
 // the pcm_s16le codec. On failure the returned error wraps the ffmpeg error
 // and carries the command output.
 // TODO: use https://github.com/mccoyst/ogg?
 func audioToWav(src, dst string) error {
 	ffmpegArgs := []string{
 		"-i", src,
 		"-format", "s16le",
 		"-ar", "16000",
 		"-ac", "1",
 		"-acodec", "pcm_s16le",
 		dst,
 	}
 	if output, err := ffmpegCommand(ffmpegArgs); err != nil {
 		return fmt.Errorf("error: %w out: %s", err, output)
 	}
 	return nil
 }
 // Transcript transcribes the audio file at audiopath with the given whisper
 // model. The audio is first converted to WAV inside a temporary directory
 // (removed on return), decoded to float32 samples and processed by a new
 // model context configured with threads, language ("auto" when language is
 // empty) and, when translate is true, translation. The returned result holds
 // one schema.Segment per segment plus the concatenated text.
 func Transcript(model whisper.Model, audiopath, language string, translate bool, threads uint) (schema.TranscriptionResult, error) {
 	result := schema.TranscriptionResult{}
 	workDir, err := os.MkdirTemp("", "whisper")
 	if err != nil {
 		return result, err
 	}
 	defer os.RemoveAll(workDir)
 	wavPath := filepath.Join(workDir, "converted.wav")
 	if err = audioToWav(audiopath, wavPath); err != nil {
 		return result, err
 	}
 	// Open the converted file and decode all of its PCM samples.
 	wavFile, err := os.Open(wavPath)
 	if err != nil {
 		return result, err
 	}
 	defer wavFile.Close()
 	pcm, err := wav.NewDecoder(wavFile).FullPCMBuffer()
 	if err != nil {
 		return result, err
 	}
 	samples := pcm.AsFloat32Buffer().Data
 	// Configure a fresh context and run the model over the samples.
 	wctx, err := model.NewContext()
 	if err != nil {
 		return result, err
 	}
 	wctx.SetThreads(threads)
 	lang := language
 	if lang == "" {
 		lang = "auto"
 	}
 	wctx.SetLanguage(lang)
 	if translate {
 		wctx.SetTranslate(true)
 	}
 	if err = wctx.Process(samples, nil, nil); err != nil {
 		return result, err
 	}
 	// Collect segments until NextSegment reports an error; that error only
 	// ends the loop and is not propagated to the caller.
 	for {
 		seg, segErr := wctx.NextSegment()
 		if segErr != nil {
 			break
 		}
 		var tokenIDs []int
 		for _, tok := range seg.Tokens {
 			tokenIDs = append(tokenIDs, tok.Id)
 		}
 		result.Segments = append(result.Segments, schema.Segment{
 			Id:     seg.Num,
 			Text:   seg.Text,
 			Start:  seg.Start,
 			End:    seg.End,
 			Tokens: tokenIDs,
 		})
 		result.Text += seg.Text
 	}
 	return result, nil
 }
--- a/backend/go/transcribe/whisper.go
+++ b/backend/go/transcribe/whisper.go
@@ -0,0 +1,26 @@
 package main
 // This is a wrapper to satisfy the gRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 )
 // Whisper is the backend type for whisper transcription. It embeds
 // base.SingleThread and holds the loaded whisper model.
 type Whisper struct {
 	base.SingleThread
 	// whisper is the model handle assigned by Load.
 	whisper whisper.Model
 }
 // Load creates the whisper model from opts.ModelFile and stores it on the
 // receiver. The value returned by whisper.New is stored even when an error
 // is reported; the error is returned to the caller unchanged.
 func (sd *Whisper) Load(opts *pb.ModelOptions) error {
 	// Note: the Model here is a path to a directory containing the model files
 	var err error
 	sd.whisper, err = whisper.New(opts.ModelFile)
 	return err
 }
 // AudioTranscription transcribes the audio file named by opts.Dst with the
 // loaded model, forwarding the language, translate and thread settings of
 // the request to Transcript.
 func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (schema.TranscriptionResult, error) {
 	threadCount := uint(opts.Threads)
 	return Transcript(sd.whisper, opts.Dst, opts.Language, opts.Translate, threadCount)
 }
--- a/backend/go/transcribe/whisper/whisper.go
+++ b/backend/go/transcribe/whisper/whisper.go
@@ -1,105 +0,0 @@
 package main
 // This is a wrapper to satisfy the gRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	"os"
 	"path/filepath"
 	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
 	"github.com/go-audio/wav"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/LocalAI/pkg/utils"
 )
 // Whisper is the backend type for whisper transcription. It embeds
 // base.SingleThread and holds the loaded whisper model.
 type Whisper struct {
 	base.SingleThread
 	// whisper is the model handle assigned by Load.
 	whisper whisper.Model
 }
 // Load creates the whisper model from opts.ModelFile and stores it on the
 // receiver. The value returned by whisper.New is stored even when an error
 // is reported; the error is returned to the caller unchanged.
 func (sd *Whisper) Load(opts *pb.ModelOptions) error {
 	// Note: the Model here is a path to a directory containing the model files
 	var err error
 	sd.whisper, err = whisper.New(opts.ModelFile)
 	return err
 }
 // AudioTranscription transcribes the audio file named by opts.Dst. The audio
 // is converted to WAV inside a temporary directory (removed on return),
 // decoded to float32 samples and processed by a new context of the loaded
 // model configured from opts (threads, language — "auto" when empty — and
 // translation). The result carries one segment per model segment plus the
 // concatenated text.
 func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.TranscriptResult, error) {
 	workDir, err := os.MkdirTemp("", "whisper")
 	if err != nil {
 		return pb.TranscriptResult{}, err
 	}
 	defer os.RemoveAll(workDir)
 	wavPath := filepath.Join(workDir, "converted.wav")
 	if err = utils.AudioToWav(opts.Dst, wavPath); err != nil {
 		return pb.TranscriptResult{}, err
 	}
 	// Open the converted file and decode all of its PCM samples.
 	wavFile, err := os.Open(wavPath)
 	if err != nil {
 		return pb.TranscriptResult{}, err
 	}
 	defer wavFile.Close()
 	pcm, err := wav.NewDecoder(wavFile).FullPCMBuffer()
 	if err != nil {
 		return pb.TranscriptResult{}, err
 	}
 	samples := pcm.AsFloat32Buffer().Data
 	// Configure a fresh context and run the model over the samples.
 	wctx, err := sd.whisper.NewContext()
 	if err != nil {
 		return pb.TranscriptResult{}, err
 	}
 	wctx.SetThreads(uint(opts.Threads))
 	lang := opts.Language
 	if lang == "" {
 		lang = "auto"
 	}
 	wctx.SetLanguage(lang)
 	if opts.Translate {
 		wctx.SetTranslate(true)
 	}
 	if err = wctx.Process(samples, nil, nil); err != nil {
 		return pb.TranscriptResult{}, err
 	}
 	// Collect segments until NextSegment reports an error; that error only
 	// ends the loop and is not propagated to the caller.
 	var (
 		collected = []*pb.TranscriptSegment{}
 		fullText  = ""
 	)
 	for {
 		seg, segErr := wctx.NextSegment()
 		if segErr != nil {
 			break
 		}
 		var tokenIDs []int32
 		for _, tok := range seg.Tokens {
 			tokenIDs = append(tokenIDs, int32(tok.Id))
 		}
 		collected = append(collected, &pb.TranscriptSegment{
 			Id:     int32(seg.Num),
 			Text:   seg.Text,
 			Start:  int64(seg.Start),
 			End:    int64(seg.End),
 			Tokens: tokenIDs,
 		})
 		fullText += seg.Text
 	}
 	return pb.TranscriptResult{
 		Segments: collected,
 		Text:     fullText,
 	}, nil
 }
--- a/backend/python/autogptq/requirements.txt
+++ b/backend/python/autogptq/requirements.txt
@@ -1,6 +1,6 @@
 accelerate
 auto-gptq==0.7.1
-grpcio==1.66.1
+grpcio==1.65.4
 protobuf
 certifi
 transformers
--- a/backend/python/bark/requirements.txt
+++ b/backend/python/bark/requirements.txt
@@ -1,4 +1,4 @@
 bark==0.1.5
-grpcio==1.66.1
+grpcio==1.65.5
 protobuf
 certifi
--- a/backend/python/common/template/requirements.txt
+++ b/backend/python/common/template/requirements.txt
@@ -1,2 +1,2 @@
-grpcio==1.66.1
+grpcio==1.65.5
 protobuf
--- a/backend/python/coqui/requirements.txt
+++ b/backend/python/coqui/requirements.txt
@@ -1,4 +1,4 @@
 TTS==0.22.0
-grpcio==1.66.1
+grpcio==1.65.5
 protobuf
 certifi
--- a/backend/python/diffusers/backend.py
+++ b/backend/python/diffusers/backend.py
@@ -168,7 +168,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            if request.CFGScale != 0:
                self.cfg_scale = request.CFGScale
-            clipmodel = "Lykon/dreamshaper-8"
+            clipmodel = "runwayml/stable-diffusion-v1-5"
            if request.CLIPModel != "":
                clipmodel = request.CLIPModel
            clipsubfolder = "text_encoder"
--- a/backend/python/diffusers/requirements.txt
+++ b/backend/python/diffusers/requirements.txt
@@ -1,5 +1,5 @@
 setuptools
-grpcio==1.66.1
+grpcio==1.65.4
 pillow
 protobuf
 certifi
--- a/backend/python/diffusers/test.py
+++ b/backend/python/diffusers/test.py
@@ -53,7 +53,7 @@ class TestBackendServicer(unittest.TestCase):
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="Lykon/dreamshaper-8"))
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="runwayml/stable-diffusion-v1-5"))
                self.assertTrue(response.success)
                self.assertEqual(response.message, "Model loaded successfully")
        except Exception as err:
@@ -71,7 +71,7 @@ class TestBackendServicer(unittest.TestCase):
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="Lykon/dreamshaper-8"))
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="runwayml/stable-diffusion-v1-5"))
                print(response.message)
                self.assertTrue(response.success)
                image_req = backend_pb2.GenerateImageRequest(positive_prompt="cat", width=16,height=16, dst="test.jpg")
@@ -81,4 +81,4 @@ class TestBackendServicer(unittest.TestCase):
            print(err)
            self.fail("Image gen service failed")
        finally:
-            self.tearDown()
+            self.tearDown()
--- a/backend/python/exllama/.gitignore
+++ b/backend/python/exllama/.gitignore
@@ -0,0 +1 @@
 source
--- a/backend/python/exllama/Makefile
+++ b/backend/python/exllama/Makefile
@@ -0,0 +1,25 @@
 # Exported to the recipes below and passed to install.sh as its first argument.
 export CONDA_ENV_PATH = "exllama.yml"
 # Install the backend (runs install.sh) once the protobuf stubs exist.
 .PHONY: exllama
 exllama: protogen
 	bash install.sh ${CONDA_ENV_PATH}
 # Start the backend through run.sh once the protobuf stubs exist.
 .PHONY: run
 run: protogen
 	@echo "Running exllama..."
 	bash run.sh
 	@echo "exllama run."
 # Generate the Python gRPC stubs if they are missing.
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
 # Remove the generated gRPC stubs.
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 # Both stub files are produced by one protoc invocation on backend.proto,
 # which is looked up through the -I../.. include path.
 backend_pb2_grpc.py backend_pb2.py:
 	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
 # Remove the generated stubs plus the venv, source and __pycache__ directories.
 .PHONY: clean
 clean: protogen-clean
 	$(RM) -r venv source __pycache__
--- a/backend/python/exllama/README.md
+++ b/backend/python/exllama/README.md
@@ -0,0 +1,5 @@
 # Creating a separate environment for the exllama project
 ```
 make exllama
 ```
--- a/backend/python/exllama/backend.py
+++ b/backend/python/exllama/backend.py
@@ -0,0 +1,159 @@
 #!/usr/bin/env python3
 import grpc
 from concurrent import futures
 import time
 import backend_pb2
 import backend_pb2_grpc
 import argparse
 import signal
 import sys
 import os, glob
 from pathlib import Path
 import torch
 import torch.nn.functional as F
 from torch import version as torch_version
 from source.tokenizer import ExLlamaTokenizer
 from source.generator import ExLlamaGenerator
 from source.model import ExLlama, ExLlamaCache, ExLlamaConfig
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
 # Implement the BackendServicer class with the service methods
 class BackendServicer(backend_pb2_grpc.BackendServicer):
    """
    gRPC servicer that loads an ExLlama model and serves text generation requests.
    """
    def generate(self, prompt, max_new_tokens):
        """
        Generates up to max_new_tokens tokens that continue prompt.
        Args:
            prompt: The text to continue.
            max_new_tokens: Upper bound on the number of tokens to sample.
        Returns:
            The decoded text of the newly generated tokens, prefixed with a space
            when the first generated piece starts with the '▁' marker. An empty
            string when max_new_tokens is 0.
        """
        self.generator.end_beam_search()
        # Tokenizing the input
        ids = self.generator.tokenizer.encode(prompt)
        self.generator.gen_begin_reuse(ids)
        initial_len = self.generator.sequence[0].shape[0]
        has_leading_space = False
        generated_any = False
        for i in range(max_new_tokens):
            token = self.generator.gen_single_token()
            generated_any = True
            if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'):
                has_leading_space = True
            if token.item() == self.generator.tokenizer.eos_token_id:
                break
        if not generated_any:
            return ''
        # Only the final decode is ever returned, so decode once after the loop
        # instead of re-decoding the whole generated sequence after every token.
        decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:])
        if has_leading_space:
            decoded_text = ' ' + decoded_text
        return decoded_text
    def Health(self, request, context):
        """
        Health check: always replies with the bytes message "OK".
        """
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
    def LoadModel(self, request, context):
        """
        Loads the ExLlama model found in the directory request.ModelFile.
        The directory must contain tokenizer.model, config.json and at least one
        *.safetensors file (the first glob match is used). request.ContextSize and
        request.RopeFreqScale, when non-zero, override the matching config values.
        Returns:
            A Result with success=True on success, or success=False and a message
            describing the failure.
        """
        try:
            # https://github.com/turboderp/exllama/blob/master/example_cfg.py
            model_directory = request.ModelFile
            # Locate files we need within that directory
            tokenizer_path = os.path.join(model_directory, "tokenizer.model")
            model_config_path = os.path.join(model_directory, "config.json")
            st_pattern = os.path.join(model_directory, "*.safetensors")
            weight_files = glob.glob(st_pattern)
            if not weight_files:
                # Report the real problem instead of an opaque IndexError.
                return backend_pb2.Result(success=False, message=f"No .safetensors file found in {model_directory}")
            model_path = weight_files[0]
            # Create config, model, tokenizer and generator
            config = ExLlamaConfig(model_config_path)               # create config from config.json
            config.model_path = model_path                          # supply path to model weights file
            if (request.ContextSize):
                config.max_seq_len = request.ContextSize            # override max sequence length
                config.max_attention_size = request.ContextSize**2  # Should be set to context_size^2.
                # https://github.com/turboderp/exllama/issues/220#issuecomment-1720324163
            # Set Rope scaling.
            if (request.RopeFreqScale):
                # Alpha value for Rope scaling.
                # Higher value increases context but adds perplexity.
                # alpha_value and compress_pos_emb are mutually exclusive.
                # https://github.com/turboderp/exllama/issues/115
                config.alpha_value = request.RopeFreqScale
                config.calculate_rotary_embedding_base()
            model = ExLlama(config)                                 # create ExLlama instance and load the weights
            tokenizer = ExLlamaTokenizer(tokenizer_path)            # create tokenizer from tokenizer model file
            cache = ExLlamaCache(model, batch_size = 2)             # create cache for inference
            generator = ExLlamaGenerator(model, tokenizer, cache)   # create generator
            self.generator = generator
            self.model = model
            self.tokenizer = tokenizer
            self.cache = cache
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(message="Model loaded successfully", success=True)
    def Predict(self, request, context):
        """
        Generates a completion for request.Prompt with the loaded model.
        Sampling uses request.Temperature, request.TopK and request.TopP; the
        repetition penalty defaults to 1.15 when request.Penalty is 0 and the
        token budget defaults to 512 when request.Tokens is 0.
        Returns:
            A message whose `message` field holds the UTF-8 encoded completion
            with any occurrence of the prompt removed.
        """
        penalty = 1.15
        if request.Penalty != 0.0:
            penalty = request.Penalty
        self.generator.settings.token_repetition_penalty_max = penalty
        self.generator.settings.temperature = request.Temperature
        self.generator.settings.top_k = request.TopK
        self.generator.settings.top_p = request.TopP
        tokens = 512
        if request.Tokens != 0:
            tokens = request.Tokens
        # Rebuild the cache and generator with batch_size=2 if a cache of batch size 1 is in place.
        if self.cache.batch_size == 1:
            del self.cache
            self.cache = ExLlamaCache(self.model, batch_size=2)
            self.generator = ExLlamaGenerator(self.model, self.tokenizer, self.cache)
        t = self.generate(request.Prompt, tokens)
        # Remove prompt from response if present
        if request.Prompt in t:
            t = t.replace(request.Prompt, "")
        # NOTE(review): Health answers with backend_pb2.Reply for a bytes message while this
        # returns backend_pb2.Result — confirm against backend.proto which type Predict expects.
        return backend_pb2.Result(message=bytes(t, encoding='utf-8'))
    def PredictStream(self, request, context):
        """
        Streaming variant of Predict.
        Token-by-token streaming is not implemented yet: the complete prediction
        is computed by Predict and yielded as the single item of the stream, so
        this method is a generator, as a streaming handler has to be.
        """
        yield self.Predict(request, context)
 def serve(address):
    """
    Starts the gRPC server on address and blocks until it is interrupted.
    SIGINT and SIGTERM stop the server and exit the process with status 0.
    """
    grpc_server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), grpc_server)
    grpc_server.add_insecure_port(address)
    grpc_server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)
    # One handler shared by both termination signals
    def _shutdown(sig, frame):
        print("Received termination signal. Shutting down...")
        grpc_server.stop(0)
        sys.exit(0)
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, _shutdown)
    # Keep the main thread alive while the server runs
    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        grpc_server.stop(0)
 if __name__ == "__main__":
    # Parse the bind address from the command line and start serving.
    cli = argparse.ArgumentParser(description="Run the gRPC server.")
    cli.add_argument("--addr", default="localhost:50051", help="The address to bind the server to.")
    serve(cli.parse_args().addr)
--- a/backend/python/exllama/install.sh
+++ b/backend/python/exllama/install.sh
@@ -0,0 +1,13 @@
 #!/bin/bash
 # Installs the exllama backend: installs the backend requirements, clones the
 # upstream exllama sources into ${MY_DIR}/source, installs their requirements
 # and copies this backend's Python files next to them.
 set -e
 LIMIT_TARGETS="cublas"
 # Quote the path so a checkout location containing spaces still works.
 source "$(dirname "$0")/../common/libbackend.sh"
 installRequirements
 git clone https://github.com/turboderp/exllama "${MY_DIR}/source"
 # BUILD_ISOLATION_FLAG is deliberately unquoted: when empty it must expand to
 # no argument at all rather than to an empty string.
 # shellcheck disable=SC2086
 uv pip install ${BUILD_ISOLATION_FLAG} --requirement "${MY_DIR}/source/requirements.txt"
 cp -v ./*py "${MY_DIR}/source/"
--- a/backend/python/exllama/requirements-cpu.txt
+++ b/backend/python/exllama/requirements-cpu.txt
@@ -0,0 +1,3 @@
 transformers
 accelerate
 torch
--- a/backend/python/exllama/requirements-cublas11.txt
+++ b/backend/python/exllama/requirements-cublas11.txt
@@ -0,0 +1,4 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 torch
 transformers
 accelerate
--- a/backend/python/exllama/requirements-cublas12.txt
+++ b/backend/python/exllama/requirements-cublas12.txt
@@ -0,0 +1,3 @@
 torch
 transformers
 accelerate
--- a/backend/python/exllama/requirements.txt
+++ b/backend/python/exllama/requirements.txt
@@ -0,0 +1,4 @@
 grpcio==1.65.5
 protobuf
 certifi
 setuptools
--- a/backend/python/exllama/run.sh
+++ b/backend/python/exllama/run.sh
@@ -0,0 +1,7 @@
 #!/bin/bash
 # Starts the exllama backend from the copy of backend.py that install.sh
 # placed inside the cloned sources.
 LIMIT_TARGETS="cublas"
 source "$(dirname "$0")/../common/libbackend.sh"
 # Build BACKEND_FILE only after sourcing libbackend.sh: install.sh in this
 # backend reads MY_DIR after the same source line, so evaluating it here makes
 # both scripts refer to the same ${MY_DIR}/source directory instead of
 # expanding a still-unset MY_DIR to "/source/backend.py".
 BACKEND_FILE="${MY_DIR}/source/backend.py"
 # "$@" keeps every argument as its own word, even when it contains spaces.
 startBackend "$@"
--- a/backend/python/exllama/test.sh
+++ b/backend/python/exllama/test.sh
@@ -0,0 +1,6 @@
 #!/bin/bash
 # Runs the backend unit tests through the shared libbackend.sh helpers.
 set -e
 # Quote the path so a checkout location containing spaces still works.
 source "$(dirname "$0")/../common/libbackend.sh"
 runUnittests
--- a/backend/python/exllama2/requirements.txt
+++ b/backend/python/exllama2/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.65.4
 protobuf
 certifi
 wheel
--- a/backend/python/mamba/requirements.txt
+++ b/backend/python/mamba/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.66.1
+grpcio==1.65.5
 protobuf
 certifi
--- a/backend/python/openvoice/requirements-intel.txt
+++ b/backend/python/openvoice/requirements-intel.txt
@@ -2,7 +2,7 @@
 intel-extension-for-pytorch
 torch
 optimum[openvino]
-grpcio==1.66.1
+grpcio==1.65.5
 protobuf
 librosa==0.9.1
 faster-whisper==1.0.3
--- a/backend/python/openvoice/requirements.txt
+++ b/backend/python/openvoice/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.65.5
 protobuf
 librosa
 faster-whisper
--- a/backend/python/parler-tts/requirements-hipblas.txt
+++ b/backend/python/parler-tts/requirements-hipblas.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
-torch==2.3.0+rocm6.0
+torch
-torchaudio==2.3.0+rocm6.0
+torchaudio
 transformers
-accelerate
+accelerate
--- a/backend/python/parler-tts/requirements.txt
+++ b/backend/python/parler-tts/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.65.5
 protobuf
 certifi
 llvmlite==0.43.0
--- a/backend/python/rerankers/requirements.txt
+++ b/backend/python/rerankers/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.66.1
+grpcio==1.65.4
 protobuf
 certifi
--- a/backend/python/sentencetransformers/requirements.txt
+++ b/backend/python/sentencetransformers/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.66.1
+grpcio==1.65.5
 protobuf
 certifi
--- a/backend/python/transformers-musicgen/backend.py
+++ b/backend/python/transformers-musicgen/backend.py
@@ -15,7 +15,7 @@ import backend_pb2_grpc
 import grpc
-from scipy.io import wavfile
+from scipy.io.wavfile import write as write_wav
 from transformers import AutoProcessor, MusicgenForConditionalGeneration
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
@@ -63,61 +63,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.Result(message="Model loaded successfully", success=True)
    def SoundGeneration(self, request, context):
        """
        Generates audio with the MusicGen model named by request.model and writes it to request.dst.
        The processor and model are loaded from request.model on every call. Inputs are:
        unconditional when request.text is empty; text plus the audio read from
        request.src (optionally truncated to 1/src_divisor of its samples) when
        `src` is set; text only otherwise.
        Optional request fields: duration (seconds, default 256 tokens),
        temperature (used as guidance scale, default 3.0) and sample (default True).
        Returns:
            A Result with success=True, or success=False and a message on failure.
        """
        model_name = request.model
        if model_name == "":
            return backend_pb2.Result(success=False, message="request.model is required")
        try:
            self.processor = AutoProcessor.from_pretrained(model_name)
            self.model = MusicgenForConditionalGeneration.from_pretrained(model_name)
            inputs = None
            if request.text == "":
                inputs = self.model.get_unconditional_inputs(num_samples=1)
            elif request.HasField('src'):
                # TODO SECURITY: request.src is read as given; restrict/validate the
                # path before exposing this to untrusted callers.
                # Read the file the caller supplied rather than a placeholder path.
                sample_rate, wsamples = wavfile.read(request.src)
                if request.HasField('src_divisor'):
                    wsamples = wsamples[: len(wsamples) // request.src_divisor]
                inputs = self.processor(
                    audio=wsamples,
                    sampling_rate=sample_rate,
                    text=[request.text],
                    padding=True,
                    return_tensors="pt",
                )
            else:
                inputs = self.processor(
                    text=[request.text],
                    padding=True,
                    return_tensors="pt",
                )
            tokens = 256
            if request.HasField('duration'):
                tokens = int(request.duration * 51.2) # 256 tokens = 5 seconds, therefore 51.2 tokens is one second
            guidance = 3.0
            if request.HasField('temperature'):
                guidance = request.temperature
            dosample = True
            if request.HasField('sample'):
                dosample = request.sample
            audio_values = self.model.generate(**inputs, do_sample=dosample, guidance_scale=guidance, max_new_tokens=tokens)
            print("[transformers-musicgen] SoundGeneration generated!", file=sys.stderr)
            sampling_rate = self.model.config.audio_encoder.sampling_rate
            wavfile.write(request.dst, rate=sampling_rate, data=audio_values[0, 0].numpy())
            print("[transformers-musicgen] SoundGeneration saved to", request.dst, file=sys.stderr)
            print("[transformers-musicgen] SoundGeneration for", file=sys.stderr)
            print("[transformers-musicgen] SoundGeneration requested tokens", tokens, file=sys.stderr)
            print(request, file=sys.stderr)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(success=True)
 # The TTS endpoint is older, and provides fewer features, but exists for compatibility reasons
    def TTS(self, request, context):
        model_name = request.model
        if model_name == "":
@@ -130,7 +75,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                padding=True,
                return_tensors="pt",
            )
-            tokens = 512 # No good place to set the "length" in TTS, so use 10s as a sane default
+            tokens = 256
            # TODO get tokens from request?
            audio_values = self.model.generate(**inputs, max_new_tokens=tokens)
            print("[transformers-musicgen] TTS generated!", file=sys.stderr)
            sampling_rate = self.model.config.audio_encoder.sampling_rate
--- a/backend/python/transformers-musicgen/requirements.txt
+++ b/backend/python/transformers-musicgen/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.65.5
 protobuf
 scipy==1.14.0
 certifi
--- a/backend/python/transformers-musicgen/test.py
+++ b/backend/python/transformers-musicgen/test.py
@@ -63,7 +63,7 @@ class TestBackendServicer(unittest.TestCase):
    def test_tts(self):
        """
-        This method tests if TTS is generated successfully
+        This method tests if the embeddings are generated successfully
        """
        try:
            self.setUp()
@@ -77,24 +77,5 @@ class TestBackendServicer(unittest.TestCase):
        except Exception as err:
            print(err)
            self.fail("TTS service failed")
        finally:
            self.tearDown()
    def test_sound_generation(self):
        """
        Checks that the SoundGeneration RPC answers after loading facebook/musicgen-small
        """
        try:
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                client = backend_pb2_grpc.BackendStub(channel)
                load_result = client.LoadModel(backend_pb2.ModelOptions(Model="facebook/musicgen-small"))
                self.assertTrue(load_result.success)
                generation = client.SoundGeneration(
                    backend_pb2.SoundGenerationRequest(text="80s TV news production music hit for tonight's biggest story")
                )
                self.assertIsNotNone(generation)
        except Exception as err:
            print(err)
            self.fail("SoundGeneration service failed")
        finally:
            self.tearDown()
--- a/backend/python/transformers/requirements.txt
+++ b/backend/python/transformers/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.65.5
 protobuf
 certifi
 setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/vall-e-x/requirements.txt
+++ b/backend/python/vall-e-x/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.66.1
+grpcio==1.65.5
 protobuf
 certifi
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -135,26 +135,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        res = await gen.__anext__()
        return res
    def Embedding(self, request, context):
        """
        A gRPC method that calculates embeddings for the sentence in request.Embeddings.
        Args:
            request: An EmbeddingRequest object that contains the request parameters.
            context: A grpc.ServicerContext object that provides information about the RPC.
        Returns:
            An EmbeddingResult holding the embedding of the first model output, or an
            empty EmbeddingResult (with INVALID_ARGUMENT set on the context) when the
            model produced no output.
        """
        sentence = request.Embeddings
        print("Calculated embeddings for: " + sentence, file=sys.stderr)
        encoded = self.model.encode(sentence)
        # Only the first result is reported back to the caller
        if len(encoded) != 0:
            return backend_pb2.EmbeddingResult(embeddings=encoded[0].outputs.embedding)
        context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
        context.set_details("No embeddings were calculated.")
        return backend_pb2.EmbeddingResult()
    async def PredictStream(self, request, context):
        """
        Generates text based on the given prompt and sampling parameters, and streams the results.
--- a/backend/python/vllm/requirements.txt
+++ b/backend/python/vllm/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.65.5
 protobuf
 certifi
 setuptools
--- a/backend/python/vllm/test.py
+++ b/backend/python/vllm/test.py
@@ -72,28 +72,5 @@ class TestBackendServicer(unittest.TestCase):
        except Exception as err:
            print(err)
            self.fail("text service failed")
        finally:
            self.tearDown()
    def test_embedding(self):
        """
        Checks that the Embedding RPC returns a non-empty list of embeddings
        """
        try:
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                client = backend_pb2_grpc.BackendStub(channel)
                load_result = client.LoadModel(backend_pb2.ModelOptions(Model="intfloat/e5-mistral-7b-instruct"))
                self.assertTrue(load_result.success)
                reply = client.Embedding(backend_pb2.PredictOptions(Embeddings="This is a test sentence."))
                vector = reply.embeddings
                self.assertIsNotNone(vector)
                # the embeddings must come back as a list ...
                self.assertIsInstance(vector, list)
                # ... holding at least one value
                self.assertTrue(len(vector) > 0)
        except Exception as err:
            print(err)
            self.fail("Embedding service failed")
        finally:
            self.tearDown()
--- a/core/backend/backend_suite_test.go
+++ b/core/backend/backend_suite_test.go
@@ -1,13 +0,0 @@
 package backend_test
 import (
 	"testing"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
 // TestBackend is the `go test` entry point for this package: it registers
 // Ginkgo's Fail as the Gomega failure handler and runs the specs under the
 // suite name "Backend test suite".
 func TestBackend(t *testing.T) {
 	RegisterFailHandler(Fail)
 	RunSpecs(t, "Backend test suite")
 }
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -9,8 +9,6 @@ import (
 	"sync"
 	"unicode/utf8"
 	"github.com/rs/zerolog/log"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"
@@ -89,7 +87,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 			case string:
 				protoMessages[i].Content = ct
 			default:
-				return nil, fmt.Errorf("unsupported type for schema.Message.Content for inference: %T", ct)
+				return nil, fmt.Errorf("Unsupported type for schema.Message.Content for inference: %T", ct)
 			}
 		}
 	}
@@ -183,37 +181,13 @@ func Finetune(config config.BackendConfig, input, prediction string) string {
 		mu.Lock()
 		reg, ok := cutstrings[c]
 		if !ok {
-			r, err := regexp.Compile(c)
+			cutstrings[c] = regexp.MustCompile(c)
 			if err != nil {
 				log.Fatal().Err(err).Msg("failed to compile regex")
 			}
 			cutstrings[c] = r
 			reg = cutstrings[c]
 		}
 		mu.Unlock()
 		prediction = reg.ReplaceAllString(prediction, "")
 	}
 	// extract results from the response which can be for instance inside XML tags
 	var predResult string
 	for _, r := range config.ExtractRegex {
 		mu.Lock()
 		reg, ok := cutstrings[r]
 		if !ok {
 			regex, err := regexp.Compile(r)
 			if err != nil {
 				log.Fatal().Err(err).Msg("failed to compile regex")
 			}
 			cutstrings[r] = regex
 			reg = regex
 		}
 		mu.Unlock()
 		predResult += reg.FindString(prediction)
 	}
 	if predResult != "" {
 		prediction = predResult
 	}
 	for _, c := range config.TrimSpace {
 		prediction = strings.TrimSpace(strings.TrimPrefix(prediction, c))
 	}
--- a/core/backend/llm_test.go
+++ b/core/backend/llm_test.go
@@ -1,109 +0,0 @@
 package backend_test
 import (
 	. "github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
 var _ = Describe("LLM tests", func() {
 	Context("Finetune LLM output", func() {
 		var (
 			testConfig config.BackendConfig
 			input      string
 			prediction string
 			result     string
 		)
 		BeforeEach(func() {
 			testConfig = config.BackendConfig{
 				PredictionOptions: schema.PredictionOptions{
 					Echo: false,
 				},
 				LLMConfig: config.LLMConfig{
 					Cutstrings:   []string{`<.*?>`},                  // Example regex for removing XML tags
 					ExtractRegex: []string{`<result>(.*?)</result>`}, // Example regex to extract from tags
 					TrimSpace:    []string{" ", "\n"},
 					TrimSuffix:   []string{".", "!"},
 				},
 			}
 		})
 		Context("when echo is enabled", func() {
 			BeforeEach(func() {
 				testConfig.Echo = true
 				input = "Hello"
 				prediction = "World"
 			})
 			It("should prepend input to prediction", func() {
 				result = Finetune(testConfig, input, prediction)
 				Expect(result).To(Equal("HelloWorld"))
 			})
 		})
 		Context("when echo is disabled", func() {
 			BeforeEach(func() {
 				testConfig.Echo = false
 				input = "Hello"
 				prediction = "World"
 			})
 			It("should not modify the prediction with input", func() {
 				result = Finetune(testConfig, input, prediction)
 				Expect(result).To(Equal("World"))
 			})
 		})
 		Context("when cutstrings regex is applied", func() {
 			BeforeEach(func() {
 				input = ""
 				prediction = "<div>Hello</div> World"
 			})
 			It("should remove substrings matching cutstrings regex", func() {
 				result = Finetune(testConfig, input, prediction)
 				Expect(result).To(Equal("Hello World"))
 			})
 		})
 		Context("when extract regex is applied", func() {
 			BeforeEach(func() {
 				input = ""
 				prediction = "<response><result>42</result></response>"
 			})
 			It("should extract substrings matching the extract regex", func() {
 				result = Finetune(testConfig, input, prediction)
 				Expect(result).To(Equal("42"))
 			})
 		})
 		Context("when trimming spaces", func() {
 			BeforeEach(func() {
 				input = ""
 				prediction = "   Hello World   "
 			})
 			It("should trim spaces from the prediction", func() {
 				result = Finetune(testConfig, input, prediction)
 				Expect(result).To(Equal("Hello World"))
 			})
 		})
 		Context("when trimming suffixes", func() {
 			BeforeEach(func() {
 				input = ""
 				prediction = "Hello World."
 			})
 			It("should trim suffixes from the prediction", func() {
 				result = Finetune(testConfig, input, prediction)
 				Expect(result).To(Equal("Hello World"))
 			})
 		})
 	})
 })
--- a/core/backend/soundgeneration.go
+++ b/core/backend/soundgeneration.go
@@ -1,74 +0,0 @@
 package backend
 import (
 	"context"
 	"fmt"
 	"os"
 	"path/filepath"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/utils"
 )
 // SoundGeneration loads the given backend and asks it to generate an audio
 // file from text (optionally conditioned on a source audio file).
 //
 // Parameters:
 //   - backend: name of the backend to load; must be non-empty.
 //   - modelFile: model identifier forwarded both to the loader and the request.
 //   - text: the prompt forwarded to the backend.
 //   - duration, temperature, doSample, sourceFile, sourceDivisor: optional
 //     request fields, forwarded unchanged (nil means "not set").
 //   - loader, appConfig, backendConfig: used to build the load options and to
 //     locate the audio output directory (appConfig.AudioDir).
 //
 // Returns the path of the destination wav file inside appConfig.AudioDir, the
 // raw backend result, and an error if loading, directory creation, the RPC
 // itself, or the backend-reported status failed.
 func SoundGeneration(
 	backend string,
 	modelFile string,
 	text string,
 	duration *float32,
 	temperature *float32,
 	doSample *bool,
 	sourceFile *string,
 	sourceDivisor *int32,
 	loader *model.ModelLoader,
 	appConfig *config.ApplicationConfig,
 	backendConfig config.BackendConfig,
 ) (string, *proto.Result, error) {
 	if backend == "" {
 		return "", nil, fmt.Errorf("backend is a required parameter")
 	}
 	grpcOpts := gRPCModelOpts(backendConfig)
 	opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
 		model.WithBackendString(backend),
 		model.WithModel(modelFile),
 		model.WithContext(appConfig.Context),
 		model.WithAssetDir(appConfig.AssetsDestination),
 		model.WithLoadGRPCLoadModelOpts(grpcOpts),
 	})
 	soundGenModel, err := loader.BackendLoader(opts...)
 	if err != nil {
 		return "", nil, err
 	}
 	if soundGenModel == nil {
 		return "", nil, fmt.Errorf("could not load sound generation model")
 	}
 	if err := os.MkdirAll(appConfig.AudioDir, 0750); err != nil {
 		return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
 	}
 	fileName := utils.GenerateUniqueFileName(appConfig.AudioDir, "sound_generation", ".wav")
 	filePath := filepath.Join(appConfig.AudioDir, fileName)
 	res, err := soundGenModel.SoundGeneration(context.Background(), &proto.SoundGenerationRequest{
 		Text:        text,
 		Model:       modelFile,
 		Dst:         filePath,
 		Sample:      doSample,
 		Duration:    duration,
 		Temperature: temperature,
 		Src:         sourceFile,
 		SrcDivisor:  sourceDivisor,
 	})
 	// The transport error must be checked before touching res: on failure res
 	// may be nil and reading res.Success would panic.
 	if err != nil {
 		return "", nil, err
 	}
 	if res == nil {
 		return "", nil, fmt.Errorf("sound generation backend returned an empty result")
 	}
 	// return RPC error if any
 	if !res.Success {
 		// Use an explicit verb so '%' characters in the backend message are
 		// not interpreted as formatting directives.
 		return "", nil, fmt.Errorf("%s", res.Message)
 	}
 	return filePath, res, nil
 }
--- a/core/backend/transcript.go
+++ b/core/backend/transcript.go
@@ -3,13 +3,12 @@ package backend
 import (
 	"context"
 	"fmt"
 	"time"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/grpc/proto"
-	"github.com/mudler/LocalAI/pkg/model"
+	model "github.com/mudler/LocalAI/pkg/model"
 )
 func ModelTranscription(audio, language string, translate bool, ml *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (*schema.TranscriptionResult, error) {
@@ -22,40 +21,19 @@ func ModelTranscription(audio, language string, translate bool, ml *model.ModelL
 		model.WithAssetDir(appConfig.AssetsDestination),
 	})
-	transcriptionModel, err := ml.BackendLoader(opts...)
+	whisperModel, err := ml.BackendLoader(opts...)
 	if err != nil {
 		return nil, err
 	}
-	if transcriptionModel == nil {
+	if whisperModel == nil {
-		return nil, fmt.Errorf("could not load transcription model")
+		return nil, fmt.Errorf("could not load whisper model")
 	}
-	r, err := transcriptionModel.AudioTranscription(context.Background(), &proto.TranscriptRequest{
+	return whisperModel.AudioTranscription(context.Background(), &proto.TranscriptRequest{
 		Dst:       audio,
 		Language:  language,
 		Translate: translate,
 		Threads:   uint32(*backendConfig.Threads),
 	})
 	if err != nil {
 		return nil, err
 	}
 	tr := &schema.TranscriptionResult{
 		Text: r.Text,
 	}
 	for _, s := range r.Segments {
 		var tks []int
 		for _, t := range s.Tokens {
 			tks = append(tks, int(t))
 		}
 		tr.Segments = append(tr.Segments,
 			schema.Segment{
 				Text:   s.Text,
 				Id:     int(s.Id),
 				Start:  time.Duration(s.Start),
 				End:    time.Duration(s.End),
 				Tokens: tks,
 			})
 	}
 	return tr, err
 }
--- a/core/backend/tts.go
+++ b/core/backend/tts.go
@@ -9,15 +9,31 @@ import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/pkg/grpc/proto"
-	"github.com/mudler/LocalAI/pkg/model"
+	model "github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/utils"
 )
 func generateUniqueFileName(dir, baseName, ext string) string {
 	counter := 1
 	fileName := baseName + ext
 	for {
 		filePath := filepath.Join(dir, fileName)
 		_, err := os.Stat(filePath)
 		if os.IsNotExist(err) {
 			return fileName
 		}
 		counter++
 		fileName = fmt.Sprintf("%s_%d%s", baseName, counter, ext)
 	}
 }
 func ModelTTS(
 	backend,
 	text,
 	modelFile,
-	voice,
+	voice ,
 	language string,
 	loader *model.ModelLoader,
 	appConfig *config.ApplicationConfig,
@@ -50,7 +66,7 @@ func ModelTTS(
 		return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
 	}
-	fileName := utils.GenerateUniqueFileName(appConfig.AudioDir, "tts", ".wav")
+	fileName := generateUniqueFileName(appConfig.AudioDir, "tts", ".wav")
 	filePath := filepath.Join(appConfig.AudioDir, fileName)
 	// If the model file is not empty, we pass it joined with the model path
@@ -72,15 +88,12 @@ func ModelTTS(
 	}
 	res, err := ttsModel.TTS(context.Background(), &proto.TTSRequest{
-		Text:     text,
+		Text:  text,
-		Model:    modelPath,
+		Model: modelPath,
-		Voice:    voice,
+		Voice: voice,
-		Dst:      filePath,
+		Dst:   filePath,
 		Language: &language,
 	})
 	if err != nil {
 		return "", nil, err
 	}
 	// return RPC error if any
 	if !res.Success {
--- a/core/cli/api/p2p.go
+++ b/core/cli/api/p2p.go
@@ -1,80 +0,0 @@
 package cli_api
 import (
 	"context"
 	"fmt"
 	"net"
 	"os"
 	"strings"
 	"github.com/mudler/LocalAI/core/p2p"
 	"github.com/mudler/edgevpn/pkg/node"
 	"github.com/rs/zerolog/log"
 )
 // StartP2PStack starts the P2P pieces selected by its arguments.
 //
 //   - When federated is true, the port of address is exposed as a service on
 //     the federated network and a service discoverer is attached to the node
 //     returned by p2p.ExposeService.
 //   - When token is non-empty, a service discoverer for the worker network is
 //     attached; it reuses the federated node if one was created above,
 //     otherwise a new node is created and started.
 //
 // It returns the first error encountered, or nil.
 func StartP2PStack(ctx context.Context, address, token, networkID string, federated bool) error {
 	// n holds the node shared between the federated and the worker sections.
 	var n *node.Node
 	// Here we are avoiding creating multiple nodes:
 	// - if the federated mode is enabled, we create a federated node and expose a service
 	// - exposing a service creates a node with specific options, and we don't want to create another node
 	// If the federated mode is enabled, we expose a service to the local instance running
 	// at the given address
 	if federated {
 		// Only the port of address is used; the service is exposed on "localhost".
 		_, port, err := net.SplitHostPort(address)
 		if err != nil {
 			return err
 		}
 		// Here a new node is created and started
 		// and a service is exposed by the node
 		// NOTE: the local variable "node" shadows the imported node package within this block.
 		node, err := p2p.ExposeService(ctx, "localhost", port, token, p2p.NetworkID(networkID, p2p.FederatedID))
 		if err != nil {
 			return err
 		}
 		if err := p2p.ServiceDiscoverer(ctx, node, token, p2p.NetworkID(networkID, p2p.FederatedID), nil, false); err != nil {
 			return err
 		}
 		// Remember the node so the worker discovery below does not create a second one.
 		n = node
 	}
 	// If the p2p mode is enabled, we start the service discovery
 	if token != "" {
 		// If a node wasn't created previously, create it
 		if n == nil {
 			node, err := p2p.NewNode(token)
 			if err != nil {
 				return err
 			}
 			err = node.Start(ctx)
 			if err != nil {
 				return fmt.Errorf("starting new node: %w", err)
 			}
 			n = node
 		}
 		// Attach a ServiceDiscoverer to the p2p node
 		log.Info().Msg("Starting P2P server discovery...")
 		// The callback rebuilds the list of online worker tunnel addresses and
 		// publishes it, comma-separated, in the LLAMACPP_GRPC_SERVERS env variable.
 		if err := p2p.ServiceDiscoverer(ctx, n, token, p2p.NetworkID(networkID, p2p.WorkerID), func(serviceID string, node p2p.NodeData) {
 			var tunnelAddresses []string
 			for _, v := range p2p.GetAvailableNodes(p2p.NetworkID(networkID, p2p.WorkerID)) {
 				if v.IsOnline() {
 					tunnelAddresses = append(tunnelAddresses, v.TunnelAddress)
 				} else {
 					log.Info().Msgf("Node %s is offline", v.ID)
 				}
 			}
 			tunnelEnvVar := strings.Join(tunnelAddresses, ",")
 			os.Setenv("LLAMACPP_GRPC_SERVERS", tunnelEnvVar)
 			log.Debug().Msgf("setting LLAMACPP_GRPC_SERVERS to %s", tunnelEnvVar)
 		}, true); err != nil {
 			return err
 		}
 	}
 	return nil
 }
--- a/core/cli/cli.go
+++ b/core/cli/cli.go
@@ -8,13 +8,12 @@ import (
 var CLI struct {
 	cliContext.Context `embed:""`
-	Run             RunCMD             `cmd:"" help:"Run LocalAI, this the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"`
+	Run        RunCMD        `cmd:"" help:"Run LocalAI, this the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"`
-	Federated       FederatedCLI       `cmd:"" help:"Run LocalAI in federated mode"`
+	Federated  FederatedCLI  `cmd:"" help:"Run LocalAI in federated mode"`
-	Models          ModelsCMD          `cmd:"" help:"Manage LocalAI models and definitions"`
+	Models     ModelsCMD     `cmd:"" help:"Manage LocalAI models and definitions"`
-	TTS             TTSCMD             `cmd:"" help:"Convert text to speech"`
+	TTS        TTSCMD        `cmd:"" help:"Convert text to speech"`
-	SoundGeneration SoundGenerationCMD `cmd:"" help:"Generates audio files from text or audio"`
+	Transcript TranscriptCMD `cmd:"" help:"Convert audio to text"`
-	Transcript      TranscriptCMD      `cmd:"" help:"Convert audio to text"`
+	Worker     worker.Worker `cmd:"" help:"Run workers to distribute workload (llama.cpp-only)"`
-	Worker          worker.Worker      `cmd:"" help:"Run workers to distribute workload (llama.cpp-only)"`
+	Util       UtilCMD       `cmd:"" help:"Utility commands"`
-	Util            UtilCMD            `cmd:"" help:"Utility commands"`
+	Explorer   ExplorerCMD   `cmd:"" help:"Run p2p explorer"`
 	Explorer        ExplorerCMD        `cmd:"" help:"Run p2p explorer"`
 }
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -3,10 +3,11 @@ package cli
 import (
 	"context"
 	"fmt"
 	"net"
 	"os"
 	"strings"
 	"time"
 	cli_api "github.com/mudler/LocalAI/core/cli/api"
 	cliContext "github.com/mudler/LocalAI/core/cli/context"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http"
@@ -52,8 +53,6 @@ type RunCMD struct {
 	DisablePredownloadScan bool     `env:"LOCALAI_DISABLE_PREDOWNLOAD_SCAN" help:"If true, disables the best-effort security scanner before downloading any files." group:"hardening" default:"false"`
 	OpaqueErrors           bool     `env:"LOCALAI_OPAQUE_ERRORS" default:"false" help:"If true, all error responses are replaced with blank 500 errors. This is intended only for hardening against information leaks and is normally not recommended." group:"hardening"`
 	Peer2Peer              bool     `env:"LOCALAI_P2P,P2P" name:"p2p" default:"false" help:"Enable P2P mode" group:"p2p"`
 	Peer2PeerDHTInterval   int      `env:"LOCALAI_P2P_DHT_INTERVAL,P2P_DHT_INTERVAL" default:"360" name:"p2p-dht-interval" help:"Interval for DHT refresh (used during token generation)" group:"p2p"`
 	Peer2PeerOTPInterval   int      `env:"LOCALAI_P2P_OTP_INTERVAL,P2P_OTP_INTERVAL" default:"9000" name:"p2p-otp-interval" help:"Interval for OTP refresh (used during token generation)" group:"p2p"`
 	Peer2PeerToken         string   `env:"LOCALAI_P2P_TOKEN,P2P_TOKEN,TOKEN" name:"p2ptoken" help:"Token for P2P mode (optional)" group:"p2p"`
 	Peer2PeerNetworkID     string   `env:"LOCALAI_P2P_NETWORK_ID,P2P_NETWORK_ID" help:"Network ID for P2P mode, can be set arbitrarly by the user for grouping a set of instances" group:"p2p"`
 	ParallelRequests       bool     `env:"LOCALAI_PARALLEL_REQUESTS,PARALLEL_REQUESTS" help:"Enable backends to handle multiple requests in parallel if they support it (e.g.: llama.cpp or vllm)" group:"backends"`
@@ -108,7 +107,7 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 			// IF no token is provided, and p2p is enabled,
 			// we generate one and wait for the user to pick up the token (this is for interactive)
 			log.Info().Msg("No token provided, generating one")
-			token = p2p.GenerateToken(r.Peer2PeerDHTInterval, r.Peer2PeerOTPInterval)
+			token = p2p.GenerateToken()
 			log.Info().Msg("Generated Token:")
 			fmt.Println(token)
@@ -116,12 +115,52 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 			fmt.Printf("export TOKEN=\"%s\"\nlocal-ai worker p2p-llama-cpp-rpc\n", token)
 		}
 		opts = append(opts, config.WithP2PToken(token))
 		node, err := p2p.NewNode(token)
 		if err != nil {
 			return err
 		}
 		nodeContext := context.Background()
 		err = node.Start(nodeContext)
 		if err != nil {
 			return fmt.Errorf("starting new node: %w", err)
 		}
 		log.Info().Msg("Starting P2P server discovery...")
 		if err := p2p.ServiceDiscoverer(nodeContext, node, token, p2p.NetworkID(r.Peer2PeerNetworkID, p2p.WorkerID), func(serviceID string, node p2p.NodeData) {
 			var tunnelAddresses []string
 			for _, v := range p2p.GetAvailableNodes(p2p.NetworkID(r.Peer2PeerNetworkID, p2p.WorkerID)) {
 				if v.IsOnline() {
 					tunnelAddresses = append(tunnelAddresses, v.TunnelAddress)
 				} else {
 					log.Info().Msgf("Node %s is offline", v.ID)
 				}
 			}
 			tunnelEnvVar := strings.Join(tunnelAddresses, ",")
 			os.Setenv("LLAMACPP_GRPC_SERVERS", tunnelEnvVar)
 			log.Debug().Msgf("setting LLAMACPP_GRPC_SERVERS to %s", tunnelEnvVar)
 		}, true); err != nil {
 			return err
 		}
 	}
-	backgroundCtx := context.Background()
+	if r.Federated {
 		_, port, err := net.SplitHostPort(r.Address)
 		if err != nil {
 			return err
 		}
 		fedCtx := context.Background()
-	if err := cli_api.StartP2PStack(backgroundCtx, r.Address, token, r.Peer2PeerNetworkID, r.Federated); err != nil {
+		node, err := p2p.ExposeService(fedCtx, "localhost", port, token, p2p.NetworkID(r.Peer2PeerNetworkID, p2p.FederatedID))
-		return err
+		if err != nil {
 			return err
 		}
 		if err := p2p.ServiceDiscoverer(fedCtx, node, token, p2p.NetworkID(r.Peer2PeerNetworkID, p2p.FederatedID), nil, false); err != nil {
 			return err
 		}
 	}
 	idleWatchDog := r.EnableWatchdogIdle
--- a/core/cli/soundgeneration.go
+++ b/core/cli/soundgeneration.go
@@ -1,110 +0,0 @@
 package cli
 import (
 	"context"
 	"fmt"
 	"os"
 	"path/filepath"
 	"strconv"
 	"strings"
 	"github.com/mudler/LocalAI/core/backend"
 	cliContext "github.com/mudler/LocalAI/core/cli/context"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/rs/zerolog/log"
 )
 // SoundGenerationCMD holds the arguments and flags of the sound generation
 // CLI command. The struct tags declare the flag names, environment variables,
 // defaults and help texts.
 type SoundGenerationCMD struct {
 	// Text is the positional argument list; Run joins the words with spaces.
 	Text []string `arg:""`
 	// Duration, Temperature and InputFileSampleDivisor are kept as strings and
 	// parsed in Run; a value that fails to parse is passed on as nil.
 	Backend                string   `short:"b" required:"" help:"Backend to run the SoundGeneration model"`
 	Model                  string   `short:"m" required:"" help:"Model name to run the SoundGeneration"`
 	Duration               string   `short:"d" help:"If specified, the length of audio to generate in seconds"`
 	Temperature            string   `short:"t" help:"If specified, the temperature of the generation"`
 	InputFile              string   `short:"i" help:"If specified, the input file to condition generation upon"`
 	InputFileSampleDivisor string   `short:"f" help:"If InputFile and this divisor is specified, the first portion of the sample file will be used"`
 	DoSample               bool     `short:"s" default:"true" help:"Enables sampling from the model. Better quality at the cost of speed. Defaults to enabled."`
 	OutputFile             string   `short:"o" type:"path" help:"The path to write the output wav file"`
 	ModelsPath             string   `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
 	BackendAssetsPath      string   `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
 	// Each entry is split in Run at the first ':' into backend name and URI.
 	ExternalGRPCBackends   []string `env:"LOCALAI_EXTERNAL_GRPC_BACKENDS,EXTERNAL_GRPC_BACKENDS" help:"A list of external grpc backends" group:"backends"`
 }
 // parseToFloat32Ptr converts input to a float32 and returns a pointer to it.
 // It returns nil when input cannot be parsed as a 32-bit float (including the
 // empty string and out-of-range values).
 func parseToFloat32Ptr(input string) *float32 {
 	parsed, parseErr := strconv.ParseFloat(input, 32)
 	if parseErr == nil {
 		value := float32(parsed)
 		return &value
 	}
 	return nil
 }
 // parseToInt32Ptr converts a base-10 input to an int32 and returns a pointer
 // to it. It returns nil when input cannot be parsed as a 32-bit integer
 // (including the empty string and out-of-range values).
 func parseToInt32Ptr(input string) *int32 {
 	parsed, parseErr := strconv.ParseInt(input, 10, 32)
 	if parseErr == nil {
 		value := int32(parsed)
 		return &value
 	}
 	return nil
 }
 // Run executes the sound generation command: it builds an application config
 // from the CLI flags, asks the selected backend to generate audio for the
 // joined Text arguments, and prints the path of the generated file.
 //
 // When OutputFile is set, the generated file is written into its directory
 // and then renamed to OutputFile; otherwise it is left in BackendAssetsPath.
 // It returns an error if an external backend entry is not of the form
 // "name:uri", if generation fails, or if the rename fails.
 func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {
 	outputFile := t.OutputFile
 	outputDir := t.BackendAssetsPath
 	if outputFile != "" {
 		outputDir = filepath.Dir(outputFile)
 	}
 	text := strings.Join(t.Text, " ")
 	externalBackends := make(map[string]string)
 	// split ":" to get backend name and the uri
 	for _, v := range t.ExternalGRPCBackends {
 		sep := strings.IndexByte(v, ':')
 		if sep < 0 {
 			// Slicing with the -1 returned by IndexByte would panic.
 			return fmt.Errorf("invalid external gRPC backend %q: expected format name:uri", v)
 		}
 		// Named "name" rather than "backend" to avoid shadowing the imported backend package.
 		name := v[:sep]
 		uri := v[sep+1:]
 		externalBackends[name] = uri
 		// NOTE(review): looks like a leftover debug print — confirm whether it can be dropped.
 		fmt.Printf("TMP externalBackends[%q]=%q\n\n", name, uri)
 	}
 	opts := &config.ApplicationConfig{
 		ModelPath:            t.ModelsPath,
 		Context:              context.Background(),
 		AudioDir:             outputDir,
 		AssetsDestination:    t.BackendAssetsPath,
 		ExternalGRPCBackends: externalBackends,
 	}
 	ml := model.NewModelLoader(opts.ModelPath)
 	// Make sure backend processes are stopped on every return path below.
 	defer func() {
 		err := ml.StopAllGRPC()
 		if err != nil {
 			log.Error().Err(err).Msg("unable to stop all grpc processes")
 		}
 	}()
 	options := config.BackendConfig{}
 	options.SetDefaults()
 	// A nil inputFile tells the backend that no conditioning file was given.
 	var inputFile *string
 	if t.InputFile != "" {
 		inputFile = &t.InputFile
 	}
 	filePath, _, err := backend.SoundGeneration(t.Backend, t.Model, text,
 		parseToFloat32Ptr(t.Duration), parseToFloat32Ptr(t.Temperature), &t.DoSample,
 		inputFile, parseToInt32Ptr(t.InputFileSampleDivisor), ml, opts, options)
 	if err != nil {
 		return err
 	}
 	if outputFile != "" {
 		if err := os.Rename(filePath, outputFile); err != nil {
 			return err
 		}
 		fmt.Printf("Generate file %s\n", outputFile)
 	} else {
 		fmt.Printf("Generate file %s\n", filePath)
 	}
 	return nil
 }
--- a/core/cli/worker/worker.go
+++ b/core/cli/worker/worker.go
@@ -2,7 +2,6 @@ package worker
 type WorkerFlags struct {
 	BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
 	ExtraLLamaCPPArgs string `name:"llama-cpp-args" env:"LOCALAI_EXTRA_LLAMA_CPP_ARGS,EXTRA_LLAMA_CPP_ARGS" help:"Extra arguments to pass to llama-cpp-rpc-server"`
 }
 type Worker struct {
--- a/core/cli/worker/worker_llamacpp.go
+++ b/core/cli/worker/worker_llamacpp.go
@@ -3,7 +3,6 @@ package worker
 import (
 	"fmt"
 	"os"
 	"strings"
 	"syscall"
 	cliContext "github.com/mudler/LocalAI/core/cli/context"
@@ -13,6 +12,7 @@ import (
 )
 type LLamaCPP struct {
 	Args        []string `arg:"" optional:"" name:"models" help:"Model configuration URLs to load"`
 	WorkerFlags `embed:""`
 }
@@ -34,8 +34,9 @@ func (r *LLamaCPP) Run(ctx *cliContext.Context) error {
 		"llama-cpp-rpc-server",
 	)
-	args := strings.Split(r.ExtraLLamaCPPArgs, " ")
+	args := os.Args[4:]
 	args, grpcProcess = library.LoadLDSO(r.BackendAssetsPath, args, grpcProcess)
 	args = append([]string{grpcProcess}, args...)
 	return syscall.Exec(
 		grpcProcess,
--- a/core/cli/worker/worker_p2p.go
+++ b/core/cli/worker/worker_p2p.go
@@ -8,7 +8,6 @@ import (
 	"fmt"
 	"os"
 	"os/exec"
 	"strings"
 	"time"
 	cliContext "github.com/mudler/LocalAI/core/cli/context"
@@ -21,11 +20,12 @@ import (
 type P2P struct {
 	WorkerFlags        `embed:""`
-	Token              string `env:"LOCALAI_TOKEN,LOCALAI_P2P_TOKEN,TOKEN" help:"P2P token to use"`
+	Token              string   `env:"LOCALAI_TOKEN,LOCALAI_P2P_TOKEN,TOKEN" help:"P2P token to use"`
-	NoRunner           bool   `env:"LOCALAI_NO_RUNNER,NO_RUNNER" help:"Do not start the llama-cpp-rpc-server"`
+	NoRunner           bool     `env:"LOCALAI_NO_RUNNER,NO_RUNNER" help:"Do not start the llama-cpp-rpc-server"`
-	RunnerAddress      string `env:"LOCALAI_RUNNER_ADDRESS,RUNNER_ADDRESS" help:"Address of the llama-cpp-rpc-server"`
+	RunnerAddress      string   `env:"LOCALAI_RUNNER_ADDRESS,RUNNER_ADDRESS" help:"Address of the llama-cpp-rpc-server"`
-	RunnerPort         string `env:"LOCALAI_RUNNER_PORT,RUNNER_PORT" help:"Port of the llama-cpp-rpc-server"`
+	RunnerPort         string   `env:"LOCALAI_RUNNER_PORT,RUNNER_PORT" help:"Port of the llama-cpp-rpc-server"`
-	Peer2PeerNetworkID string `env:"LOCALAI_P2P_NETWORK_ID,P2P_NETWORK_ID" help:"Network ID for P2P mode, can be set arbitrarly by the user for grouping a set of instances" group:"p2p"`
+	ExtraLLamaCPPArgs  []string `env:"LOCALAI_EXTRA_LLAMA_CPP_ARGS,EXTRA_LLAMA_CPP_ARGS" help:"Extra arguments to pass to llama-cpp-rpc-server"`
 	Peer2PeerNetworkID string   `env:"LOCALAI_P2P_NETWORK_ID,P2P_NETWORK_ID" help:"Network ID for P2P mode, can be set arbitrarly by the user for grouping a set of instances" group:"p2p"`
 }
 func (r *P2P) Run(ctx *cliContext.Context) error {
@@ -76,8 +76,8 @@ func (r *P2P) Run(ctx *cliContext.Context) error {
 					"util",
 					"llama-cpp-rpc-server",
 				)
-				extraArgs := strings.Split(r.ExtraLLamaCPPArgs, " ")
+
-				args := append([]string{"--host", address, "--port", fmt.Sprint(port)}, extraArgs...)
+				args := append([]string{"--host", address, "--port", fmt.Sprint(port)}, r.ExtraLLamaCPPArgs...)
 				args, grpcProcess = library.LoadLDSO(r.BackendAssetsPath, args, grpcProcess)
 				cmd := exec.Command(
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -126,7 +126,6 @@ type LLMConfig struct {
 	Grammar         string   `yaml:"grammar"`
 	StopWords       []string `yaml:"stopwords"`
 	Cutstrings      []string `yaml:"cutstrings"`
 	ExtractRegex    []string `yaml:"extract_regex"`
 	TrimSpace       []string `yaml:"trimspace"`
 	TrimSuffix      []string `yaml:"trimsuffix"`
--- a/core/http/app_test.go
+++ b/core/http/app_test.go
@@ -772,17 +772,6 @@ var _ = Describe("API test", func() {
 			Expect(err.Error()).To(ContainSubstring("error, status code: 500, message: could not load model - all backends returned error:"))
 		})
 		It("shows the external backend", func() {
 			// do an http request to the /system endpoint
 			resp, err := http.Get("http://127.0.0.1:9090/system")
 			Expect(err).ToNot(HaveOccurred())
 			Expect(resp.StatusCode).To(Equal(200))
 			dat, err := io.ReadAll(resp.Body)
 			Expect(err).ToNot(HaveOccurred())
 			Expect(string(dat)).To(ContainSubstring("huggingface"))
 			Expect(string(dat)).To(ContainSubstring("llama-cpp"))
 		})
 		It("transcribes audio", func() {
 			if runtime.GOOS != "linux" {
 				Skip("test supported only on linux")
--- a/core/http/endpoints/elevenlabs/soundgeneration.go
+++ b/core/http/endpoints/elevenlabs/soundgeneration.go
@@ -1,65 +0,0 @@
 package elevenlabs
 import (
 	"github.com/gofiber/fiber/v2"
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
 	fiberContext "github.com/mudler/LocalAI/core/http/ctx"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/rs/zerolog/log"
 )
 // SoundGenerationEndpoint is the ElevenLabs SoundGeneration endpoint https://elevenlabs.io/docs/api-reference/sound-generation
 //
 // The returned handler parses the request body, resolves the model name and
 // its backend configuration, asks the backend to generate the audio and
 // sends the resulting file as a download. If the configuration cannot be
 // loaded, the model ID from the request is used with an empty configuration.
 // @Summary Generates audio from the input text.
 // @Param request body schema.ElevenLabsSoundGenerationRequest true "query params"
 // @Success 200 {string} binary	 "Response"
 // @Router /v1/sound-generation [post]
 func SoundGenerationEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 		input := new(schema.ElevenLabsSoundGenerationRequest)
 		// Get input data from the request body
 		if err := c.BodyParser(input); err != nil {
 			return err
 		}
 		modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.ModelID, false)
 		if err != nil {
 			modelFile = input.ModelID
 			log.Warn().Str("ModelID", input.ModelID).Msg("Model not found in context")
 		}
 		cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
 			config.LoadOptionDebug(appConfig.Debug),
 			config.LoadOptionThreads(appConfig.Threads),
 			config.LoadOptionContextSize(appConfig.ContextSize),
 			config.LoadOptionF16(appConfig.F16),
 		)
 		if err != nil {
 			modelFile = input.ModelID
 			log.Warn().Str("Request ModelID", input.ModelID).Err(err).Msg("error during LoadBackendConfigFileByName, using request ModelID")
 		} else {
 			if input.ModelID != "" {
 				modelFile = input.ModelID
 			} else {
 				modelFile = cfg.Model
 			}
 		}
 		if cfg == nil {
 			// cfg is dereferenced below; fall back to an empty configuration so a
 			// failed load results in a regular error from the backend call
 			// instead of a nil pointer panic.
 			cfg = &config.BackendConfig{}
 		}
 		// Log the resolved model name (the variable, not the literal string).
 		log.Debug().Str("modelFile", modelFile).Str("backend", cfg.Backend).Msg("Sound Generation Request about to be sent to backend")
 		if input.Duration != nil {
 			log.Debug().Float32("duration", *input.Duration).Msg("duration set")
 		}
 		if input.Temperature != nil {
 			log.Debug().Float32("temperature", *input.Temperature).Msg("temperature set")
 		}
 		// TODO: Support uploading files?
 		filePath, _, err := backend.SoundGeneration(cfg.Backend, modelFile, input.Text, input.Duration, input.Temperature, input.DoSample, nil, nil, ml, appConfig, *cfg)
 		if err != nil {
 			return err
 		}
 		return c.Download(filePath)
 	}
 }
--- a/core/http/endpoints/localai/system.go
+++ b/core/http/endpoints/localai/system.go
@@ -1,29 +0,0 @@
 package localai
 import (
 	"github.com/gofiber/fiber/v2"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/model"
 )
 // SystemInformations builds the handler that reports information about this
 // LocalAI instance: the backends listed by the model loader for the assets
 // directory, followed by the names of the configured external gRPC backends.
 // @Summary Show the LocalAI instance information
 // @Success 200 {object} schema.SystemInformationResponse "Response"
 // @Router /system [get]
 func SystemInformations(ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(*fiber.Ctx) error {
 	return func(ctx *fiber.Ctx) error {
 		backends, listErr := ml.ListAvailableBackends(appConfig.AssetsDestination)
 		if listErr != nil {
 			return listErr
 		}
 		// Append every externally registered backend name to the list.
 		for name := range appConfig.ExternalGRPCBackends {
 			backends = append(backends, name)
 		}
 		response := schema.SystemInformationResponse{
 			Backends: backends,
 		}
 		return ctx.JSON(response)
 	}
 }
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -25,8 +25,9 @@ import (
 // @Success 200 {object} schema.OpenAIResponse "Response"
 // @Router /v1/chat/completions [post]
 func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startupOptions *config.ApplicationConfig) func(c *fiber.Ctx) error {
-	var id, textContentToReturn string
+	textContentToReturn := ""
-	var created int
+	id := uuid.New().String()
 	created := int(time.Now().Unix())
 	process := func(s string, req *schema.OpenAIRequest, config *config.BackendConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
 		initialMessage := schema.OpenAIResponse{
@@ -68,9 +69,9 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 		textContentToReturn = functions.ParseTextContent(result, config.FunctionsConfig)
 		result = functions.CleanupLLMResult(result, config.FunctionsConfig)
-		functionResults := functions.ParseFunctionCall(result, config.FunctionsConfig)
+		results := functions.ParseFunctionCall(result, config.FunctionsConfig)
 		log.Debug().Msgf("Text content to return: %s", textContentToReturn)
-		noActionToRun := len(functionResults) > 0 && functionResults[0].Name == noAction || len(functionResults) == 0
+		noActionToRun := len(results) > 0 && results[0].Name == noAction || len(results) == 0
 		switch {
 		case noActionToRun:
@@ -83,7 +84,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 			}
 			responses <- initialMessage
-			result, err := handleQuestion(config, req, ml, startupOptions, functionResults, result, prompt)
+			result, err := handleQuestion(config, req, ml, startupOptions, results, result, prompt)
 			if err != nil {
 				log.Error().Err(err).Msg("error handling question")
 				return
@@ -105,7 +106,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 			responses <- resp
 		default:
-			for i, ss := range functionResults {
+			for i, ss := range results {
 				name, args := ss.Name, ss.Arguments
 				initialMessage := schema.OpenAIResponse{
@@ -158,10 +159,6 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 	}
 	return func(c *fiber.Ctx) error {
 		textContentToReturn = ""
 		id = uuid.New().String()
 		created = int(time.Now().Unix())
 		modelFile, input, err := readRequest(c, cl, ml, startupOptions, true)
 		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
--- a/core/http/routes/elevenlabs.go
+++ b/core/http/routes/elevenlabs.go
@@ -16,6 +16,4 @@ func RegisterElevenLabsRoutes(app *fiber.App,
 	// Elevenlabs
 	app.Post("/v1/text-to-speech/:voice-id", auth, elevenlabs.TTSEndpoint(cl, ml, appConfig))
 	app.Post("/v1/sound-generation", auth, elevenlabs.SoundGenerationEndpoint(cl, ml, appConfig))
 }
--- a/core/http/routes/localai.go
+++ b/core/http/routes/localai.go
@@ -70,6 +70,4 @@ func RegisterLocalAIRoutes(app *fiber.App,
 		}{Version: internal.PrintableVersion()})
 	})
 	app.Get("/system", auth, localai.SystemInformations(ml, appConfig))
 }
--- a/core/http/static/assets/KFOlCnqEu92Fr1MmEU9fBBc9.ttf
+++ b/core/http/static/assets/KFOlCnqEu92Fr1MmEU9fBBc9.ttf
--- a/core/http/static/assets/KFOlCnqEu92Fr1MmEU9vAw.ttf
+++ b/core/http/static/assets/KFOlCnqEu92Fr1MmEU9vAw.ttf
--- a/core/http/static/assets/KFOlCnqEu92Fr1MmSU5fBBc9.ttf
+++ b/core/http/static/assets/KFOlCnqEu92Fr1MmSU5fBBc9.ttf
--- a/core/http/static/assets/KFOlCnqEu92Fr1MmWUlfBBc9.ttf
+++ b/core/http/static/assets/KFOlCnqEu92Fr1MmWUlfBBc9.ttf
--- a/core/http/static/assets/KFOlCnqEu92Fr1MmYUtfBBc9.ttf
+++ b/core/http/static/assets/KFOlCnqEu92Fr1MmYUtfBBc9.ttf
--- a/core/http/static/assets/KFOmCnqEu92Fr1Me5Q.ttf
+++ b/core/http/static/assets/KFOmCnqEu92Fr1Me5Q.ttf
--- a/core/http/static/assets/KFOmCnqEu92Fr1Mu4mxP.ttf
+++ b/core/http/static/assets/KFOmCnqEu92Fr1Mu4mxP.ttf
--- a/core/http/static/assets/UcCO3FwrK3iLTeHuS_fvQtMwCp50KnMw2boKoduKmMEVuFuYMZg.ttf
+++ b/core/http/static/assets/UcCO3FwrK3iLTeHuS_fvQtMwCp50KnMw2boKoduKmMEVuFuYMZg.ttf
--- a/core/http/static/assets/UcCO3FwrK3iLTeHuS_fvQtMwCp50KnMw2boKoduKmMEVuGKYMZg.ttf
+++ b/core/http/static/assets/UcCO3FwrK3iLTeHuS_fvQtMwCp50KnMw2boKoduKmMEVuGKYMZg.ttf
--- a/core/http/static/assets/UcCO3FwrK3iLTeHuS_fvQtMwCp50KnMw2boKoduKmMEVuLyfMZg.ttf
+++ b/core/http/static/assets/UcCO3FwrK3iLTeHuS_fvQtMwCp50KnMw2boKoduKmMEVuLyfMZg.ttf
--- a/core/http/static/assets/alpine.js
+++ b/core/http/static/assets/alpine.js
--- a/core/http/static/assets/highlightjs.css
+++ b/core/http/static/assets/highlightjs.css
@@ -1,9 +0,0 @@
 /*!
  Theme: Default
  Description: Original highlight.js style
  Author: (c) Ivan Sagalaev <maniac@softwaremaniacs.org>
  Maintainer: @highlightjs/core-team
  Website: https://highlightjs.org/
  License: see project LICENSE
  Touched: 2021
 */pre code.hljs{display:block;overflow-x:auto;padding:1em}code.hljs{padding:3px 5px}.hljs{background:#f3f3f3;color:#444}.hljs-comment{color:#697070}.hljs-punctuation,.hljs-tag{color:#444a}.hljs-tag .hljs-attr,.hljs-tag .hljs-name{color:#444}.hljs-attribute,.hljs-doctag,.hljs-keyword,.hljs-meta .hljs-keyword,.hljs-name,.hljs-selector-tag{font-weight:700}.hljs-deletion,.hljs-number,.hljs-quote,.hljs-selector-class,.hljs-selector-id,.hljs-string,.hljs-template-tag,.hljs-type{color:#800}.hljs-section,.hljs-title{color:#800;font-weight:700}.hljs-link,.hljs-operator,.hljs-regexp,.hljs-selector-attr,.hljs-selector-pseudo,.hljs-symbol,.hljs-template-variable,.hljs-variable{color:#ab5656}.hljs-literal{color:#695}.hljs-addition,.hljs-built_in,.hljs-bullet,.hljs-code{color:#397300}.hljs-meta{color:#1f7199}.hljs-meta .hljs-string{color:#38a}.hljs-emphasis{font-style:italic}.hljs-strong{font-weight:700}
--- a/core/http/static/assets/highlightjs.js
+++ b/core/http/static/assets/highlightjs.js
--- a/core/http/static/assets/htmx.js
+++ b/core/http/static/assets/htmx.js
--- a/core/http/static/assets/marked.js
+++ b/core/http/static/assets/marked.js
--- a/core/http/static/assets/purify.js
+++ b/core/http/static/assets/purify.js
--- a/core/http/static/assets/tailwindcss.js
+++ b/core/http/static/assets/tailwindcss.js
--- a/core/http/static/assets/tw-elements.css
+++ b/core/http/static/assets/tw-elements.css
--- a/core/http/static/assets/tw-elements.js
+++ b/core/http/static/assets/tw-elements.js
--- a/core/p2p/p2p.go
+++ b/core/p2p/p2p.go
@@ -28,15 +28,9 @@ import (
 	"github.com/mudler/edgevpn/pkg/logger"
 )
-func generateNewConnectionData(DHTInterval, OTPInterval int) *node.YAMLConnectionConfig {
+func generateNewConnectionData() *node.YAMLConnectionConfig {
 	maxMessSize := 20 << 20 // 20MB
 	keyLength := 43
 	if DHTInterval == 0 {
 		DHTInterval = 360
 	}
 	if OTPInterval == 0 {
 		OTPInterval = 9000
 	}
 	return &node.YAMLConnectionConfig{
 		MaxMessageSize: maxMessSize,
@@ -46,21 +40,21 @@ func generateNewConnectionData(DHTInterval, OTPInterval int) *node.YAMLConnectio
 		OTP: node.OTP{
 			DHT: node.OTPConfig{
 				Key:      eutils.RandStringRunes(keyLength),
-				Interval: DHTInterval,
+				Interval: 120,
 				Length:   keyLength,
 			},
 			Crypto: node.OTPConfig{
 				Key:      eutils.RandStringRunes(keyLength),
-				Interval: OTPInterval,
+				Interval: 9000,
 				Length:   keyLength,
 			},
 		},
 	}
 }
-func GenerateToken(DHTInterval, OTPInterval int) string {
+func GenerateToken() string {
 	// Generates a new config and exit
-	return generateNewConnectionData(DHTInterval, OTPInterval).Base64()
+	return generateNewConnectionData().Base64()
 }
 func IsP2PEnabled() bool {
--- a/core/p2p/p2p_disabled.go
+++ b/core/p2p/p2p_disabled.go
@@ -10,7 +10,7 @@ import (
 	"github.com/mudler/edgevpn/pkg/node"
 )
-func GenerateToken(DHTInterval, OTPInterval int) string {
+func GenerateToken() string {
 	return "not implemented"
 }
--- a/core/schema/elevenlabs.go
+++ b/core/schema/elevenlabs.go
@@ -4,11 +4,3 @@ type ElevenLabsTTSRequest struct {
 	Text    string `json:"text" yaml:"text"`
 	ModelID string `json:"model_id" yaml:"model_id"`
 }
 // ElevenLabsSoundGenerationRequest describes the body of a sound-generation
 // request (presumably modeled on the ElevenLabs API, given the name — confirm
 // against the handler that decodes it).
 //
 // Text and ModelID are always encoded. Duration, Temperature and DoSample are
 // pointers tagged with omitempty, so a nil pointer is left out of the encoded
 // JSON/YAML output.
 type ElevenLabsSoundGenerationRequest struct {
 	Text        string   `json:"text" yaml:"text"`
 	ModelID     string   `json:"model_id" yaml:"model_id"`
 	Duration    *float32 `json:"duration_seconds,omitempty" yaml:"duration_seconds,omitempty"`
 	// NOTE(review): Temperature is encoded under the key "prompt_influence",
 	// not "temperature" — the field name and wire name differ.
 	Temperature *float32 `json:"prompt_influence,omitempty" yaml:"prompt_influence,omitempty"`
 	DoSample    *bool    `json:"do_sample,omitempty" yaml:"do_sample,omitempty"`
 }
--- a/core/schema/localai.go
+++ b/core/schema/localai.go
@@ -70,7 +70,3 @@ type P2PNodesResponse struct {
 	Nodes          []p2p.NodeData `json:"nodes" yaml:"nodes"`
 	FederatedNodes []p2p.NodeData `json:"federated_nodes" yaml:"federated_nodes"`
 }
 // SystemInformationResponse is a response payload carrying a list of backends
 // as strings, encoded under the JSON key "backends". It declares only a json
 // tag (no yaml tag).
 type SystemInformationResponse struct {
 	Backends []string `json:"backends"`
 }
--- a/core/services/backend_monitor.go
+++ b/core/services/backend_monitor.go
@@ -107,7 +107,7 @@ func (bms BackendMonitorService) CheckAndSample(modelName string) (*proto.Status
 		return nil, err
 	}
 	modelAddr := bms.modelLoader.CheckIsLoaded(backendId)
-	if modelAddr == nil {
+	if modelAddr == "" {
 		return nil, fmt.Errorf("backend %s is not currently loaded", backendId)
 	}
--- a/docs/content/docs/features/GPU-acceleration.md
+++ b/docs/content/docs/features/GPU-acceleration.md
@@ -133,10 +133,6 @@ Due to the nature of ROCm it is best to run all implementations in containers as
 Ongoing verification testing of ROCm compatibility with integrated backends.
 Please note the following list of verified backends and devices.
 LocalAI hipblas images are built against the following targets: gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
 If your device is not one of these, you must set the corresponding `GPU_TARGETS` and also set `REBUILD=true`. Otherwise, you do not need to specify either of them in the commands below.
 ### Verified 
 The devices in the following list have been tested with `hipblas` images running `ROCm 6.0.0`
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Ettore Di Giacinto	2a03905920	Merge branch 'master' into cleanup_deps	2024-08-21 13:10:46 +02:00
Ettore Di Giacinto	35297ebc14	Drop also ttf files Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2024-08-21 13:03:26 +02:00
Ettore Di Giacinto	b303805df9	fix marked	2024-08-21 13:02:19 +02:00
Ettore Di Giacinto	32d51797d9	fix alpine.js	2024-08-21 13:02:19 +02:00
Ettore Di Giacinto	af09b019ed	fix(assets): generate assets on build time Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2024-08-21 13:02:19 +02:00
`@@ -1,2 +1,2 @@`
	`grpcio==1.66.1`	`grpcio==1.65.5`
	`protobuf`	`protobuf`