fix(go-grpc-server): always close resultChan

When the channel was not closed, a client call to a server that does not implement PredictStream would hang indefinitely, waiting for resultChan to be consumed. Now, when the prediction stream returns, we close the channel and wait for the goroutine to finish. Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
feat(vllm): add support for image-to-text and video-to-text (#3729)
2026-05-23 08:10:48 -04:00 · 2024-10-05 00:07:58 +02:00 · 2024-10-04 23:42:05 +02:00 · 2024-10-04 19:52:43 +02:00 · 2024-10-04 18:32:29 +02:00 · 2024-10-04 09:10:07 +02:00
132 changed files with 2836 additions and 929 deletions
--- a/.devcontainer-scripts/utils.sh
+++ b/.devcontainer-scripts/utils.sh
@@ -9,6 +9,7 @@
 # Param 2: email
 #
 config_user() {
    echo "Configuring git for $1 <$2>"
    local gcn=$(git config --global user.name)
    if [ -z "${gcn}" ]; then
        echo "Setting up git user / remote"
@@ -24,6 +25,7 @@ config_user() {
 # Param 2: remote url
 #
 config_remote() {
    echo "Adding git remote and fetching $2 as $1"
    local gr=$(git remote -v | grep $1)
    if [ -z "${gr}" ]; then
        git remote add $1 $2
--- a/.github/check_and_update.py
+++ b/.github/check_and_update.py
@@ -29,9 +29,14 @@ def calculate_sha256(file_path):
 def manual_safety_check_hf(repo_id):
    scanResponse = requests.get('https://huggingface.co/api/models/' + repo_id + "/scan")
    scan = scanResponse.json()
-    if scan['hasUnsafeFile']:
+    # Check if 'hasUnsafeFile' exists in the response
-        return scan
+    if 'hasUnsafeFile' in scan:
-    return None
+        if scan['hasUnsafeFile']:
            return scan
        else:
            return None
    else:
        return None
 download_type, repo_id_or_url = parse_uri(uri)
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -13,6 +13,78 @@ concurrency:
  cancel-in-progress: true
 jobs:
  hipblas-jobs:
    uses: ./.github/workflows/image_build.yml
    with:
      tag-latest: ${{ matrix.tag-latest }}
      tag-suffix: ${{ matrix.tag-suffix }}
      ffmpeg: ${{ matrix.ffmpeg }}
      image-type: ${{ matrix.image-type }}
      build-type: ${{ matrix.build-type }}
      cuda-major-version: ${{ matrix.cuda-major-version }}
      cuda-minor-version: ${{ matrix.cuda-minor-version }}
      platforms: ${{ matrix.platforms }}
      runs-on: ${{ matrix.runs-on }}
      base-image: ${{ matrix.base-image }}
      grpc-base-image: ${{ matrix.grpc-base-image }}
      aio: ${{ matrix.aio }}
      makeflags: ${{ matrix.makeflags }}
      latest-image: ${{ matrix.latest-image }}
      latest-image-aio: ${{ matrix.latest-image-aio }}
    secrets:
      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
    strategy:
      # Pushing with all jobs in parallel
      # eats the bandwidth of all the nodes
      max-parallel: 2
      matrix:
        include:
          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: '-hipblas-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
            aio: "-aio-gpu-hipblas"
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            latest-image: 'latest-gpu-hipblas'
            latest-image-aio: 'latest-aio-gpu-hipblas'
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-hipblas'
            ffmpeg: 'false'
            image-type: 'extras'
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-hipblas-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-hipblas-core'
            ffmpeg: 'false'
            image-type: 'core'
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
  self-hosted-jobs:
    uses: ./.github/workflows/image_build.yml
    with:
@@ -39,7 +111,7 @@ jobs:
    strategy:
      # Pushing with all jobs in parallel
      # eats the bandwidth of all the nodes
-      max-parallel: ${{ github.event_name != 'pull_request' && 6 || 10 }}
+      max-parallel: ${{ github.event_name != 'pull_request' && 5 || 8 }}
      matrix:
        include:
          # Extra images
@@ -122,29 +194,6 @@ jobs:
            base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: '-hipblas-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
            aio: "-aio-gpu-hipblas"
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            latest-image: 'latest-gpu-hipblas'
            latest-image-aio: 'latest-aio-gpu-hipblas'
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-hipblas'
            ffmpeg: 'false'
            image-type: 'extras'
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
@@ -212,26 +261,6 @@ jobs:
            image-type: 'core'
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-hipblas-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-hipblas-core'
            ffmpeg: 'false'
            image-type: 'core'
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
  core-image-build:
    uses: ./.github/workflows/image_build.yml
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -18,7 +18,7 @@ jobs:
        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
        if: ${{ github.actor != 'dependabot[bot]' }}
-        uses: securego/gosec@v2.21.0
+        uses: securego/gosec@v2.21.4
        with:
          # we let the report trigger content trigger a failure using the GitHub Security features.
          args: '-no-fail -fmt sarif -out results.sarif ./...'
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -178,13 +178,22 @@ jobs:
        uses: actions/checkout@v4
        with:
          submodules: true
      - name: Dependencies
        run: |
          # Install protoc
          curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
          unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
          rm protoc.zip
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
          PATH="$PATH:$HOME/go/bin" make protogen-go
      - name: Build images
        run: |
          docker build --build-arg FFMPEG=true --build-arg IMAGE_TYPE=extras --build-arg EXTRA_BACKENDS=rerankers --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile .
          BASE_IMAGE=local-ai:tests DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
      - name: Test
        run: |
-          LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
+            PATH="$PATH:$HOME/go/bin" LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
            make run-e2e-aio
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -15,8 +15,6 @@ Thank you for your interest in contributing to LocalAI! We appreciate your time
 - [Documentation](#documentation)
 - [Community and Communication](#community-and-communication)
 ## Getting Started
 ### Prerequisites
@@ -54,7 +52,7 @@ If you find a bug, have a feature request, or encounter any issues, please check
 ## Coding Guidelines
- No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like []`golangci-lint`](https://golangci-lint.run) can help you here.
+- No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like [`golangci-lint`](https://golangci-lint.run) can help you here.
 ## Testing
@@ -84,5 +82,3 @@ We are welcome the contribution of the documents, please open new PR or create a
 - You can reach out via the Github issue tracker.
 - Open a new discussion at [Discussion](https://github.com/go-skynet/LocalAI/discussions)
 - Join the Discord channel [Discord](https://discord.gg/uJAeKSAGDy)
 ---
--- a/13
+++ b/13
@@ -297,10 +297,10 @@ COPY .git .
 RUN make prepare
 ## Build the binary
-## If it's CUDA, we want to skip some of the llama-compat backends to save space
+## If it's CUDA or hipblas, we want to skip some of the llama-compat backends to save space
-## We only leave the most CPU-optimized variant and the fallback for the cublas build
+## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
-## (both will use CUDA for the actual computation)
+## (both will use CUDA or hipblas for the actual computation)
-RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
+RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
        SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
    else \
        make build; \
@@ -338,9 +338,8 @@ RUN if [ "${FFMPEG}" = "true" ]; then \
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
-        ssh less && \
+        ssh less wget
-    apt-get clean && \
+# For the devcontainer, leave apt functional in case additional devtools are needed at runtime.
    rm -rf /var/lib/apt/lists/*
 RUN go install github.com/go-delve/delve/cmd/dlv@latest
--- a/9
+++ b/9
@@ -8,7 +8,7 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=e6b7801bd189d102d901d3e72035611a25456ef1
+CPPLLAMA_VERSION?=d5ed2b929d85bbd7dbeecb690880f07d9d7a6077
 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
@@ -16,7 +16,7 @@ RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
-WHISPER_CPP_VERSION?=a551933542d956ae84634937acd2942eb40efaaf
+WHISPER_CPP_VERSION?=ccc2547210e09e3a1785817383ab770389bb442b
 # bert.cpp version
 BERT_REPO?=https://github.com/go-skynet/go-bert.cpp
@@ -359,6 +359,9 @@ clean-tests:
 	rm -rf test-dir
 	rm -rf core/http/backend-assets
 clean-dc: clean
 	cp -r /build/backend-assets /workspace/backend-assets
 ## Build:
 build: prepare backend-assets grpcs ## Build the project
 	$(info ${GREEN}I local-ai build info:${RESET})
@@ -465,7 +468,7 @@ run-e2e-image:
 	ls -liah $(abspath ./tests/e2e-fixtures)
 	docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --gpus all --name e2e-tests-$(RANDOM) localai-tests
-run-e2e-aio:
+run-e2e-aio: protogen-go
 	@echo 'Running e2e AIO tests'
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e-aio
--- a/README.md
+++ b/README.md
@@ -68,9 +68,7 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
 [💻 Getting started](https://localai.io/basics/getting_started/index.html)
-## 🔥🔥 Hot topics / Roadmap
+## 📰 Latest project news
 [Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
 - Aug 2024:  🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io)
 - July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723
@@ -83,8 +81,12 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
 - May 2024: Chat, TTS, and Image generation in the WebUI: https://github.com/mudler/LocalAI/pull/2222
 - April 2024: Reranker API: https://github.com/mudler/LocalAI/pull/2121
-Hot topics (looking for contributors):
+Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
 ## 🔥🔥 Hot topics (looking for help):
 - Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
 - Realtime API https://github.com/mudler/LocalAI/issues/3714
 - 🔥🔥 Distributed, P2P Global community pools: https://github.com/mudler/LocalAI/issues/3113
 - WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
 - Backends v2: https://github.com/mudler/LocalAI/issues/1126
--- a/aio/cpu/vision.yaml
+++ b/aio/cpu/vision.yaml
@@ -2,7 +2,7 @@ backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
-name: gpt-4-vision-preview
+name: gpt-4o
 roles:
  user: "USER:"
--- a/aio/gpu-8g/vision.yaml
+++ b/aio/gpu-8g/vision.yaml
@@ -2,7 +2,7 @@ backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
-name: gpt-4-vision-preview
+name: gpt-4o
 roles:
  user: "USER:"
--- a/aio/intel/vision.yaml
+++ b/aio/intel/vision.yaml
@@ -2,7 +2,7 @@ backend: llama-cpp
 context_size: 4096
 mmap: false
 f16: false
-name: gpt-4-vision-preview
+name: gpt-4o
 roles:
  user: "USER:"
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -26,6 +26,19 @@ service Backend {
  rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}
  rpc Rerank(RerankRequest) returns (RerankResult) {}
  rpc GetMetrics(MetricsRequest) returns (MetricsResponse);
 }
 // Define the empty request
 message MetricsRequest {}
 // MetricsResponse is the reply returned by the GetMetrics RPC.
 message MetricsResponse {
  // Identifier of the slot the metrics were read from.
  int32 slot_id = 1;
  string prompt_json_for_slot = 2;  // Stores the prompt as a JSON string.
  // Generation throughput, in tokens per second.
  float tokens_per_second = 3;
  // Number of tokens generated so far.
  int32 tokens_generated = 4;
  // Number of prompt tokens processed so far.
  int32 prompt_tokens_processed = 5;
 }
 message RerankRequest {
@@ -134,6 +147,9 @@ message PredictOptions {
  repeated string Images = 42;
  bool UseTokenizerTemplate = 43;
  repeated Message Messages = 44;
  repeated string Videos = 45;
  repeated string Audios = 46;
  string CorrelationId = 47;
 }
 // The response message containing the result
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -13,6 +13,7 @@
 #include <getopt.h>
 #include "clip.h"
 #include "llava.h"
 #include "log.h"
 #include "stb_image.h"
 #include "common.h"
 #include "json.hpp"
@@ -448,7 +449,7 @@ struct llama_server_context
            LOG_INFO("Multi Modal Mode Enabled", {});
            clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
            if(clp_ctx == nullptr) {
-                LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
+                LOG_ERR("unable to load clip model: %s", params.mmproj.c_str());
                return false;
            }
@@ -462,7 +463,7 @@ struct llama_server_context
        ctx = llama_init.context;
        if (model == nullptr)
        {
-            LOG_ERROR("unable to load model", {{"model", params.model}});
+            LOG_ERR("unable to load model: %s", params.model.c_str());
            return false;
        }
@@ -470,7 +471,7 @@ struct llama_server_context
            const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
            const int n_embd_llm  = llama_n_embd(model);
            if (n_embd_clip != n_embd_llm) {
-                LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
+                LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
                llama_free(ctx);
                llama_free_model(model);
                return false;
@@ -489,11 +490,21 @@ struct llama_server_context
        std::vector<char> buf(1);
        int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
        if (res < 0) {
-            LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
+            LOG_ERR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", __func__);
            sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
        }
    }
    // Returns a pointer to the first slot in `slots` whose is_processing()
    // reports true, or nullptr when none of the slots is processing.
    llama_client_slot* get_active_slot() {
        llama_client_slot* active = nullptr;
        for (llama_client_slot& candidate : slots) {
            if (candidate.is_processing()) {
                // Stop at the first match; later slots are not inspected.
                active = &candidate;
                break;
            }
        }
        return active;
    }
    void initialize() {
        // create slots
        all_slots_are_idle = true;
@@ -812,10 +823,11 @@ struct llama_server_context
                    img_sl.img_data = clip_image_u8_init();
                    if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
                    {
-                        LOG_ERROR("failed to load image", {
+                        LOG_ERR("%s: failed to load image, slot_id: %d, img_sl_id: %d", 
-                            {"slot_id",   slot->id},
+                             __func__,
-                            {"img_sl_id", img_sl.id}
+                             slot->id,
-                        });
+                             img_sl.id
                        );
                        return false;
                    }
                    LOG_VERBOSE("image loaded", {
@@ -853,12 +865,12 @@ struct llama_server_context
                                    }
                                }
                                if (!found) {
-                                    LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id);
+                                    LOG("ERROR: Image with id: %i, not found.\n", img_id);
                                    slot->images.clear();
                                    return false;
                                }
                            } catch (const std::invalid_argument& e) {
-                                LOG_TEE("Invalid image number id in prompt\n");
+                                LOG("Invalid image number id in prompt\n");
                                slot->images.clear();
                                return false;
                            }
@@ -886,7 +898,7 @@ struct llama_server_context
            {"task_id", slot->task_id},
        });
-      //  LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
+      //  LOG("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
        return true;
    }
@@ -926,7 +938,7 @@ struct llama_server_context
                };
                if (llama_decode(ctx, batch_view) != 0)
                {
-                    LOG_TEE("%s: llama_decode() failed\n", __func__);
+                    LOG("%s: llama_decode() failed\n", __func__);
                    return;
                }
            }
@@ -938,7 +950,7 @@ struct llama_server_context
            }
        }
-        LOG_TEE("system prompt updated\n");
+        LOG("system prompt updated\n");
        system_need_update = false;
    }
@@ -1120,7 +1132,7 @@ struct llama_server_context
            }
            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
-                LOG_TEE("Error processing the given image");
+                LOG("Error processing the given image");
                return false;
            }
@@ -1132,7 +1144,7 @@ struct llama_server_context
    void send_error(task_server& task, const std::string &error)
    {
-        LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
+        LOG("task %i - error: %s\n", task.id, error.c_str());
        task_result res;
        res.id = task.id;
        res.multitask_id = task.multitask_id;
@@ -1371,7 +1383,7 @@ struct llama_server_context
                };
                if (llama_decode(ctx, batch_view))
                {
-                    LOG_TEE("%s : failed to eval\n", __func__);
+                    LOG("%s : failed to eval\n", __func__);
                    return false;
                }
            }
@@ -1389,7 +1401,7 @@ struct llama_server_context
                llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, };
                if (llama_decode(ctx, batch_img))
                {
-                    LOG_TEE("%s : failed to eval image\n", __func__);
+                    LOG("%s : failed to eval image\n", __func__);
                    return false;
                }
                slot.n_past += n_eval;
@@ -1572,7 +1584,7 @@ struct llama_server_context
                    slot.n_past = 0;
                    slot.truncated = false;
                    slot.has_next_token = true;
-                    LOG_TEE("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
+                    LOG("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
                    continue;
                    // END LOCALAI changes
@@ -1820,10 +1832,11 @@ struct llama_server_context
                    if (has_images && !ingest_images(slot, n_batch))
                    {
-                        LOG_ERROR("failed processing images", {
+                        LOG_ERR("%s: failed processing images Slot id : %d, Task id: %d", 
-                            "slot_id", slot.id,
+                            __func__,
-                            "task_id", slot.task_id,
+                            slot.id,
-                        });
+                            slot.task_id
                        );
                        // FIXME @phymbert: to be properly tested
                        //  early returning without changing the slot state will block the slot for ever
                        // no one at the moment is checking the return value
@@ -1863,10 +1876,10 @@ struct llama_server_context
                        const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
                        const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
-                        LOG_TEE("\n");
+                        LOG("\n");
-                        LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
+                        LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
-                        LOG_TEE("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
+                        LOG("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
-                        LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
+                        LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
                        llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
                        llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
@@ -1876,7 +1889,7 @@ struct llama_server_context
                        slot.ga_i += slot.ga_w / slot.ga_n;
-                        LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
+                        LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
                    }
                    slot.n_past_se += n_tokens;
                }
@@ -1901,11 +1914,11 @@ struct llama_server_context
                if (n_batch == 1 || ret < 0)
                {
                    // if you get here, it means the KV cache is full - try increasing it via the context size
-                    LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
+                    LOG("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
                    return false;
                }
-                LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
+                LOG("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
                // retry with half the batch size to try to find a free slot in the KV cache
                n_batch /= 2;
@@ -2103,6 +2116,9 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
    data["ignore_eos"] = predict->ignoreeos();
    data["embeddings"] = predict->embeddings();
    // Add the correlationid to json data
    data["correlation_id"] = predict->correlationid();
    // for each image in the request, add the image data
    //
    for (int i = 0; i < predict->images_size(); i++) {
@@ -2341,6 +2357,11 @@ public:
                int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
                reply.set_prompt_tokens(tokens_evaluated);
                // Log Request Correlation Id
                LOG_VERBOSE("correlation:", {
                    { "id", data["correlation_id"] }
                });
                // Send the reply
                writer->Write(reply);
@@ -2364,6 +2385,12 @@ public:
        std::string completion_text;
        task_result result = llama.queue_results.recv(task_id);
        if (!result.error && result.stop) {
            // Log Request Correlation Id
            LOG_VERBOSE("correlation:", {
                { "id", data["correlation_id"] }
            });
            completion_text = result.result_json.value("content", "");
            int32_t tokens_predicted = result.result_json.value("tokens_predicted", 0);
            int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
@@ -2403,6 +2430,31 @@ public:
        return grpc::Status::OK;
    }
    // GetMetrics reports generation metrics for the slot that is currently
    // processing a request.
    //
    // `context` and `request` are not read (MetricsRequest has no fields).
    // `response` is populated from the slot returned by
    // llama.get_active_slot(); when that returns nullptr every field is set
    // to zero / the empty string. Always returns grpc::Status::OK.
    grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
        llama_client_slot* active_slot = llama.get_active_slot();

        if (active_slot == nullptr) {
            // No slot is processing: report neutral values.
            response->set_slot_id(0);
            response->set_prompt_json_for_slot("");
            response->set_tokens_per_second(0);
            response->set_tokens_generated(0);
            response->set_prompt_tokens_processed(0);
            return grpc::Status::OK;
        }

        // Tokens per second = 1e3 / generation time * decoded tokens
        // (the 1e3 factor suggests t_token_generation is in milliseconds —
        // TODO confirm against the slot definition).
        // A slot can be processing before any generation time has been
        // recorded; dividing by zero would put inf/NaN into the response,
        // so report 0 until a positive generation time is available.
        double tokens_per_second = 0.0;
        if (active_slot->t_token_generation > 0) {
            tokens_per_second = 1e3 / active_slot->t_token_generation * active_slot->n_decoded;
        }

        // Populate the response with metrics
        response->set_slot_id(active_slot->id);
        response->set_prompt_json_for_slot(active_slot->prompt.dump());
        response->set_tokens_per_second(tokens_per_second);
        response->set_tokens_generated(active_slot->n_decoded);
        response->set_prompt_tokens_processed(active_slot->num_prompt_tokens_processed);

        return grpc::Status::OK;
    }
 };
 void RunServer(const std::string& server_address) {
--- a/backend/python/autogptq/requirements-intel.txt
+++ b/backend/python/autogptq/requirements-intel.txt
@@ -2,4 +2,4 @@
 intel-extension-for-pytorch
 torch
 optimum[openvino]
-setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/autogptq/requirements.txt
+++ b/backend/python/autogptq/requirements.txt
@@ -1,6 +1,6 @@
 accelerate
 auto-gptq==0.7.1
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
 transformers
--- a/backend/python/bark/requirements-intel.txt
+++ b/backend/python/bark/requirements-intel.txt
@@ -3,6 +3,6 @@ intel-extension-for-pytorch
 torch
 torchaudio
 optimum[openvino]
-setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 transformers
 accelerate
--- a/backend/python/bark/requirements.txt
+++ b/backend/python/bark/requirements.txt
@@ -1,4 +1,4 @@
 bark==0.1.5
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
--- a/backend/python/common/template/requirements.txt
+++ b/backend/python/common/template/requirements.txt
@@ -1,2 +1,2 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
--- a/backend/python/coqui/requirements-intel.txt
+++ b/backend/python/coqui/requirements-intel.txt
@@ -3,6 +3,6 @@ intel-extension-for-pytorch
 torch
 torchaudio
 optimum[openvino]
-setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 transformers
 accelerate
--- a/backend/python/coqui/requirements.txt
+++ b/backend/python/coqui/requirements.txt
@@ -1,4 +1,4 @@
-TTS==0.22.0
+coqui-tts
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
--- a/backend/python/diffusers/requirements-intel.txt
+++ b/backend/python/diffusers/requirements-intel.txt
@@ -3,7 +3,7 @@ intel-extension-for-pytorch
 torch
 torchvision
 optimum[openvino]
-setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 diffusers
 opencv-python
 transformers
--- a/backend/python/diffusers/requirements.txt
+++ b/backend/python/diffusers/requirements.txt
@@ -1,5 +1,5 @@
 setuptools
-grpcio==1.66.1
+grpcio==1.66.2
 pillow
 protobuf
 certifi
--- a/backend/python/exllama2/requirements.txt
+++ b/backend/python/exllama2/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
 wheel
--- a/backend/python/mamba/requirements.txt
+++ b/backend/python/mamba/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
--- a/backend/python/openvoice/requirements-intel.txt
+++ b/backend/python/openvoice/requirements-intel.txt
@@ -2,7 +2,7 @@
 intel-extension-for-pytorch
 torch
 optimum[openvino]
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 librosa==0.9.1
 faster-whisper==1.0.3
@@ -18,6 +18,6 @@ python-dotenv
 pypinyin==0.50.0
 cn2an==0.5.22
 jieba==0.42.1
-gradio==4.38.1
+gradio==4.44.1
 langid==1.1.6
 git+https://github.com/myshell-ai/MeloTTS.git
--- a/backend/python/openvoice/requirements.txt
+++ b/backend/python/openvoice/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 librosa
 faster-whisper
--- a/backend/python/parler-tts/install.sh
+++ b/backend/python/parler-tts/install.sh
@@ -15,5 +15,12 @@ installRequirements
 # https://github.com/descriptinc/audiotools/issues/101
 # incompatible protobuf versions.
-PYDIR=$(ls ${MY_DIR}/venv/lib)
+PYDIR=python3.10
-curl -L https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o ${MY_DIR}/venv/lib/${PYDIR}/site-packages/google/protobuf/internal/builder.py
+pyenv="${MY_DIR}/venv/lib/${PYDIR}/site-packages/google/protobuf/internal/"
 if [ ! -d ${pyenv} ]; then
    echo "(parler-tts/install.sh): Error: ${pyenv} does not exist"
    exit 1
 fi
 curl -L https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o ${pyenv}/builder.py
--- a/backend/python/parler-tts/requirements-intel.txt
+++ b/backend/python/parler-tts/requirements-intel.txt
@@ -3,6 +3,6 @@ intel-extension-for-pytorch
 torch
 torchaudio
 optimum[openvino]
-setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 transformers
 accelerate
--- a/backend/python/parler-tts/requirements.txt
+++ b/backend/python/parler-tts/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
 llvmlite==0.43.0
--- a/backend/python/rerankers/requirements-intel.txt
+++ b/backend/python/rerankers/requirements-intel.txt
@@ -5,4 +5,4 @@ accelerate
 torch
 rerankers[transformers]
 optimum[openvino]
-setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/rerankers/requirements.txt
+++ b/backend/python/rerankers/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
--- a/backend/python/sentencetransformers/backend.py
+++ b/backend/python/sentencetransformers/backend.py
@@ -55,7 +55,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        """
        model_name = request.Model
        try:
-            self.model = SentenceTransformer(model_name)
+            self.model = SentenceTransformer(model_name, trust_remote_code=request.TrustRemoteCode)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
--- a/backend/python/sentencetransformers/requirements-cpu.txt
+++ b/backend/python/sentencetransformers/requirements-cpu.txt
@@ -2,5 +2,5 @@ torch
 accelerate
 transformers
 bitsandbytes
-sentence-transformers==3.0.1
+sentence-transformers==3.1.1
 transformers
--- a/backend/python/sentencetransformers/requirements-cublas11.txt
+++ b/backend/python/sentencetransformers/requirements-cublas11.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 torch
 accelerate
-sentence-transformers==3.0.1
+sentence-transformers==3.1.1
 transformers
--- a/backend/python/sentencetransformers/requirements-cublas12.txt
+++ b/backend/python/sentencetransformers/requirements-cublas12.txt
@@ -1,4 +1,4 @@
 torch
 accelerate
-sentence-transformers==3.0.1
+sentence-transformers==3.1.1
 transformers
--- a/backend/python/sentencetransformers/requirements-hipblas.txt
+++ b/backend/python/sentencetransformers/requirements-hipblas.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 torch
 accelerate
-sentence-transformers==3.0.1
+sentence-transformers==3.1.1
 transformers
--- a/backend/python/sentencetransformers/requirements-intel.txt
+++ b/backend/python/sentencetransformers/requirements-intel.txt
@@ -4,5 +4,5 @@ torch
 optimum[openvino]
 setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
 accelerate
-sentence-transformers==3.0.1
+sentence-transformers==3.1.1
 transformers
--- a/backend/python/sentencetransformers/requirements.txt
+++ b/backend/python/sentencetransformers/requirements.txt
@@ -1,3 +1,5 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
-certifi
+certifi
 datasets
 einops
--- a/backend/python/transformers-musicgen/requirements-intel.txt
+++ b/backend/python/transformers-musicgen/requirements-intel.txt
@@ -4,4 +4,4 @@ transformers
 accelerate
 torch
 optimum[openvino]
-setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/transformers-musicgen/requirements.txt
+++ b/backend/python/transformers-musicgen/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 scipy==1.14.0
 certifi
--- a/backend/python/transformers/requirements.txt
+++ b/backend/python/transformers/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
 setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/vall-e-x/requirements-intel.txt
+++ b/backend/python/vall-e-x/requirements-intel.txt
@@ -4,4 +4,4 @@ accelerate
 torch
 torchaudio
 optimum[openvino]
-setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/vall-e-x/requirements.txt
+++ b/backend/python/vall-e-x/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -5,6 +5,8 @@ import argparse
 import signal
 import sys
 import os
 from typing import List
 from PIL import Image
 import backend_pb2
 import backend_pb2_grpc
@@ -15,6 +17,8 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.sampling_params import SamplingParams
 from vllm.utils import random_uuid
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.multimodal.utils import fetch_image
 from vllm.assets.video import VideoAsset
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
@@ -105,6 +109,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        try:
            self.llm = AsyncLLMEngine.from_engine_args(engine_args)
        except Exception as err:
            print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr)
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        try:
@@ -117,7 +122,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
           )
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
-
+        print("Model loaded successfully", file=sys.stderr)
        return backend_pb2.Result(message="Model loaded successfully", success=True)
    async def Predict(self, request, context):
@@ -196,15 +201,33 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        if request.Seed != 0:
            sampling_params.seed = request.Seed
        # Extract image paths and process images
        prompt = request.Prompt
-        
+
-        # If tokenizer template is enabled and messages are provided instead of prompt apply the tokenizer template
+        image_paths = request.Images
        image_data = [self.load_image(img_path) for img_path in image_paths]
        videos_path = request.Videos
        video_data = [self.load_video(video_path) for video_path in videos_path]
        # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template
        if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
            prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
-        # Generate text
+        # Generate text using the LLM engine
        request_id = random_uuid()
-        outputs = self.llm.generate(prompt, sampling_params, request_id)
+        print(f"Generating text with request_id: {request_id}", file=sys.stderr)
        outputs = self.llm.generate(
            {
                "prompt": prompt,
                "multi_modal_data": {
                    "image": image_data if image_data else None,
                    "video": video_data if video_data else None,
                } if image_data or video_data else None,
            },
            sampling_params=sampling_params,
            request_id=request_id,
        )
        # Stream the results
        generated_text = ""
@@ -227,9 +250,49 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        if streaming:
            return
        # Remove the image files from /tmp folder
        for img_path in image_paths:
            try:
                os.remove(img_path)
            except Exception as e:
                print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
        # Sending the final generated text
        yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
    def load_image(self, image_path: str):
        """
        Open the image stored at ``image_path``.
        Args:
            image_path (str): Location of the image file on disk.
        Returns:
            The object returned by ``Image.open`` on success. If opening
            raises, the error is written to stderr and the result of
            ``self.load_video(image_path)`` is returned instead.
        """
        try:
            opened = Image.open(image_path)
        except Exception as exc:
            # Not readable as an image: report it and retry the same path as a video.
            print(f"Error loading image {image_path}: {exc}", file=sys.stderr)
            return self.load_video(image_path)
        return opened
    def load_video(self, video_path: str):
        """
        Load a video from the given file path.
        Args:
            video_path (str): The path to the video file.
        Returns:
            The ``np_ndarrays`` value of ``VideoAsset(name=video_path)``,
            or None if loading raised an exception (the error is written
            to stderr).
        """
        try:
            video = VideoAsset(name=video_path).np_ndarrays
            return video
        except Exception as e:
            # Log the path this method was actually given. The previous message
            # formatted `image_path`, which does not exist in this scope and
            # turned every load failure into a NameError.
            print(f"Error loading video {video_path}: {e}", file=sys.stderr)
            return None
 async def serve(address):
    # Start asyncio gRPC server
    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -13,4 +13,18 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi
-installRequirements
+if [ "x${BUILD_TYPE}" == "x" ]; then
        ensureVenv
        # https://docs.vllm.ai/en/v0.6.1/getting_started/cpu-installation.html
        if [ ! -d vllm ]; then
            git clone https://github.com/vllm-project/vllm
        fi
        pushd vllm
            uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.66.2 protobuf bitsandbytes
            uv pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
            VLLM_TARGET_DEVICE=cpu python setup.py install
        popd
        rm -rf vllm
    else
        installRequirements
 fi
--- a/backend/python/vllm/requirements-cublas11.txt
+++ b/backend/python/vllm/requirements-cublas11.txt
@@ -1,4 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 accelerate
 torch
-transformers
+transformers
 bitsandbytes
--- a/backend/python/vllm/requirements-cublas12.txt
+++ b/backend/python/vllm/requirements-cublas12.txt
@@ -1,3 +1,4 @@
 accelerate
 torch
-transformers
+transformers
 bitsandbytes
--- a/backend/python/vllm/requirements-hipblas.txt
+++ b/backend/python/vllm/requirements-hipblas.txt
@@ -1,4 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 accelerate
 torch
-transformers
+transformers
 bitsandbytes
--- a/backend/python/vllm/requirements-intel.txt
+++ b/backend/python/vllm/requirements-intel.txt
@@ -4,4 +4,5 @@ accelerate
 torch
 transformers
 optimum[openvino]
-setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 bitsandbytes
--- a/backend/python/vllm/requirements.txt
+++ b/backend/python/vllm/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
 setuptools
--- a/core/backend/embeddings.go
+++ b/core/backend/embeddings.go
@@ -10,20 +10,11 @@ import (
 )
 func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) {
 	modelFile := backendConfig.Model
 	grpcOpts := gRPCModelOpts(backendConfig)
 	var inferenceModel interface{}
 	var err error
-	opts := modelOpts(backendConfig, appConfig, []model.Option{
+	opts := ModelOptions(backendConfig, appConfig, []model.Option{})
 		model.WithLoadGRPCLoadModelOpts(grpcOpts),
 		model.WithThreads(uint32(*backendConfig.Threads)),
 		model.WithAssetDir(appConfig.AssetsDestination),
 		model.WithModel(modelFile),
 		model.WithContext(appConfig.Context),
 	})
 	if backendConfig.Backend == "" {
 		inferenceModel, err = loader.GreedyLoader(opts...)
--- a/core/backend/image.go
+++ b/core/backend/image.go
@@ -8,19 +8,8 @@ import (
 )
 func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) {
-	threads := backendConfig.Threads
+
-	if *threads == 0 && appConfig.Threads != 0 {
+	opts := ModelOptions(backendConfig, appConfig, []model.Option{})
 		threads = &appConfig.Threads
 	}
 	gRPCOpts := gRPCModelOpts(backendConfig)
 	opts := modelOpts(backendConfig, appConfig, []model.Option{
 		model.WithBackendString(backendConfig.Backend),
 		model.WithAssetDir(appConfig.AssetsDestination),
 		model.WithThreads(uint32(*threads)),
 		model.WithContext(appConfig.Context),
 		model.WithModel(backendConfig.Model),
 		model.WithLoadGRPCLoadModelOpts(gRPCOpts),
 	})
 	inferenceModel, err := loader.BackendLoader(
 		opts...,
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -31,24 +31,13 @@ type TokenUsage struct {
 	Completion int
 }
-func ModelInference(ctx context.Context, s string, messages []schema.Message, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
+func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos, audios []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
 	modelFile := c.Model
 	threads := c.Threads
 	if *threads == 0 && o.Threads != 0 {
 		threads = &o.Threads
 	}
 	grpcOpts := gRPCModelOpts(c)
 	var inferenceModel grpc.Backend
 	var err error
-	opts := modelOpts(c, o, []model.Option{
+	opts := ModelOptions(c, o, []model.Option{})
 		model.WithLoadGRPCLoadModelOpts(grpcOpts),
 		model.WithThreads(uint32(*threads)), // some models uses this to allocate threads during startup
 		model.WithAssetDir(o.AssetsDestination),
 		model.WithModel(modelFile),
 		model.WithContext(o.Context),
 	})
 	if c.Backend != "" {
 		opts = append(opts, model.WithBackendString(c.Backend))
@@ -101,6 +90,8 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 		opts.Messages = protoMessages
 		opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate
 		opts.Images = images
 		opts.Videos = videos
 		opts.Audios = audios
 		tokenUsage := TokenUsage{}
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -11,32 +11,65 @@ import (
 	"github.com/rs/zerolog/log"
 )
-func modelOpts(c config.BackendConfig, so *config.ApplicationConfig, opts []model.Option) []model.Option {
+func ModelOptions(c config.BackendConfig, so *config.ApplicationConfig, opts []model.Option) []model.Option {
 	name := c.Name
 	if name == "" {
 		name = c.Model
 	}
 	defOpts := []model.Option{
 		model.WithBackendString(c.Backend),
 		model.WithModel(c.Model),
 		model.WithAssetDir(so.AssetsDestination),
 		model.WithContext(so.Context),
 		model.WithModelID(name),
 	}
 	threads := 1
 	if c.Threads != nil {
 		threads = *c.Threads
 	}
 	if so.Threads != 0 {
 		threads = so.Threads
 	}
 	c.Threads = &threads
 	grpcOpts := grpcModelOpts(c)
 	defOpts = append(defOpts, model.WithLoadGRPCLoadModelOpts(grpcOpts))
 	if so.SingleBackend {
-		opts = append(opts, model.WithSingleActiveBackend())
+		defOpts = append(defOpts, model.WithSingleActiveBackend())
 	}
 	if so.ParallelBackendRequests {
-		opts = append(opts, model.EnableParallelRequests)
+		defOpts = append(defOpts, model.EnableParallelRequests)
 	}
 	if c.GRPC.Attempts != 0 {
-		opts = append(opts, model.WithGRPCAttempts(c.GRPC.Attempts))
+		defOpts = append(defOpts, model.WithGRPCAttempts(c.GRPC.Attempts))
 	}
 	if c.GRPC.AttemptsSleepTime != 0 {
-		opts = append(opts, model.WithGRPCAttemptsDelay(c.GRPC.AttemptsSleepTime))
+		defOpts = append(defOpts, model.WithGRPCAttemptsDelay(c.GRPC.AttemptsSleepTime))
 	}
 	for k, v := range so.ExternalGRPCBackends {
-		opts = append(opts, model.WithExternalBackend(k, v))
+		defOpts = append(defOpts, model.WithExternalBackend(k, v))
 	}
-	return opts
+	return append(defOpts, opts...)
 }
 func getSeed(c config.BackendConfig) int32 {
-	seed := int32(*c.Seed)
+	var seed int32 = config.RAND_SEED
 	if c.Seed != nil {
 		seed = int32(*c.Seed)
 	}
 	if seed == config.RAND_SEED {
 		seed = rand.Int31()
 	}
@@ -44,11 +77,47 @@ func getSeed(c config.BackendConfig) int32 {
 	return seed
 }
-func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
+func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 	b := 512
 	if c.Batch != 0 {
 		b = c.Batch
 	}
 	f16 := false
 	if c.F16 != nil {
 		f16 = *c.F16
 	}
 	embeddings := false
 	if c.Embeddings != nil {
 		embeddings = *c.Embeddings
 	}
 	lowVRAM := false
 	if c.LowVRAM != nil {
 		lowVRAM = *c.LowVRAM
 	}
 	mmap := false
 	if c.MMap != nil {
 		mmap = *c.MMap
 	}
 	ctxSize := 1024
 	if c.ContextSize != nil {
 		ctxSize = *c.ContextSize
 	}
 	mmlock := false
 	if c.MMlock != nil {
 		mmlock = *c.MMlock
 	}
 	nGPULayers := 9999999
 	if c.NGPULayers != nil {
 		nGPULayers = *c.NGPULayers
 	}
 	return &pb.ModelOptions{
 		CUDA:                 c.CUDA || c.Diffusers.CUDA,
 		SchedulerType:        c.Diffusers.SchedulerType,
@@ -56,14 +125,14 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		CFGScale:             c.Diffusers.CFGScale,
 		LoraAdapter:          c.LoraAdapter,
 		LoraScale:            c.LoraScale,
-		F16Memory:            *c.F16,
+		F16Memory:            f16,
 		LoraBase:             c.LoraBase,
 		IMG2IMG:              c.Diffusers.IMG2IMG,
 		CLIPModel:            c.Diffusers.ClipModel,
 		CLIPSubfolder:        c.Diffusers.ClipSubFolder,
 		CLIPSkip:             int32(c.Diffusers.ClipSkip),
 		ControlNet:           c.Diffusers.ControlNet,
-		ContextSize:          int32(*c.ContextSize),
+		ContextSize:          int32(ctxSize),
 		Seed:                 getSeed(c),
 		NBatch:               int32(b),
 		NoMulMatQ:            c.NoMulMatQ,
@@ -85,16 +154,16 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		YarnBetaSlow:         c.YarnBetaSlow,
 		NGQA:                 c.NGQA,
 		RMSNormEps:           c.RMSNormEps,
-		MLock:                *c.MMlock,
+		MLock:                mmlock,
 		RopeFreqBase:         c.RopeFreqBase,
 		RopeScaling:          c.RopeScaling,
 		Type:                 c.ModelType,
 		RopeFreqScale:        c.RopeFreqScale,
 		NUMA:                 c.NUMA,
-		Embeddings:           *c.Embeddings,
+		Embeddings:           embeddings,
-		LowVRAM:              *c.LowVRAM,
+		LowVRAM:              lowVRAM,
-		NGPULayers:           int32(*c.NGPULayers),
+		NGPULayers:           int32(nGPULayers),
-		MMap:                 *c.MMap,
+		MMap:                 mmap,
 		MainGPU:              c.MainGPU,
 		Threads:              int32(*c.Threads),
 		TensorSplit:          c.TensorSplit,
--- a/core/backend/rerank.go
+++ b/core/backend/rerank.go
@@ -9,21 +9,9 @@ import (
 	model "github.com/mudler/LocalAI/pkg/model"
 )
-func Rerank(backend, modelFile string, request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
+func Rerank(modelFile string, request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
 	bb := backend
 	if bb == "" {
 		return nil, fmt.Errorf("backend is required")
 	}
-	grpcOpts := gRPCModelOpts(backendConfig)
+	opts := ModelOptions(backendConfig, appConfig, []model.Option{model.WithModel(modelFile)})
 	opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
 		model.WithBackendString(bb),
 		model.WithModel(modelFile),
 		model.WithContext(appConfig.Context),
 		model.WithAssetDir(appConfig.AssetsDestination),
 		model.WithLoadGRPCLoadModelOpts(grpcOpts),
 	})
 	rerankModel, err := loader.BackendLoader(opts...)
 	if err != nil {
 		return nil, err
--- a/core/backend/soundgeneration.go
+++ b/core/backend/soundgeneration.go
@@ -13,7 +13,6 @@ import (
 )
 func SoundGeneration(
 	backend string,
 	modelFile string,
 	text string,
 	duration *float32,
@@ -25,18 +24,8 @@ func SoundGeneration(
 	appConfig *config.ApplicationConfig,
 	backendConfig config.BackendConfig,
 ) (string, *proto.Result, error) {
 	if backend == "" {
 		return "", nil, fmt.Errorf("backend is a required parameter")
 	}
-	grpcOpts := gRPCModelOpts(backendConfig)
+	opts := ModelOptions(backendConfig, appConfig, []model.Option{model.WithModel(modelFile)})
 	opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
 		model.WithBackendString(backend),
 		model.WithModel(modelFile),
 		model.WithContext(appConfig.Context),
 		model.WithAssetDir(appConfig.AssetsDestination),
 		model.WithLoadGRPCLoadModelOpts(grpcOpts),
 	})
 	soundGenModel, err := loader.BackendLoader(opts...)
 	if err != nil {
--- a/core/backend/token_metrics.go
+++ b/core/backend/token_metrics.go
@@ -0,0 +1,33 @@
 package backend
 import (
 	"context"
 	"fmt"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/pkg/grpc/proto"
 	model "github.com/mudler/LocalAI/pkg/model"
 )
 // TokenMetrics loads the backend for modelFile through loader.BackendLoader,
 // using the options built by ModelOptions from backendConfig and appConfig,
 // and returns the response of the backend's GetTokenMetrics call.
 //
 // It returns an error if the backend cannot be loaded, if the loader yields
 // a nil backend, or if the metrics request itself fails.
 func TokenMetrics(
 	modelFile string,
 	loader *model.ModelLoader,
 	appConfig *config.ApplicationConfig,
 	backendConfig config.BackendConfig) (*proto.MetricsResponse, error) {
 	opts := ModelOptions(backendConfig, appConfig, []model.Option{
 		model.WithModel(modelFile),
 	})
 	// Deliberately not named "model": that would shadow the imported model package.
 	metricsBackend, err := loader.BackendLoader(opts...)
 	if err != nil {
 		return nil, err
 	}
 	if metricsBackend == nil {
 		return nil, fmt.Errorf("could not load model %q", modelFile)
 	}
 	// NOTE(review): the request uses context.Background(), not appConfig.Context — confirm this is intended.
 	return metricsBackend.GetTokenMetrics(context.Background(), &proto.MetricsRequest{})
 }
--- a/core/backend/tokenize.go
+++ b/core/backend/tokenize.go
@@ -0,0 +1,44 @@
 package backend
 import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/grpc"
 	model "github.com/mudler/LocalAI/pkg/model"
 )
 // ModelTokenize tokenizes s with the model named in backendConfig and returns
 // the resulting tokens.
 //
 // When backendConfig.Backend is empty the model is loaded via
 // loader.GreedyLoader; otherwise the configured backend string is appended to
 // the load options and loader.BackendLoader is used. Load and tokenization
 // errors are returned with an empty response.
 func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (schema.TokenizeResponse, error) {
 	var (
 		tokenizer grpc.Backend
 		err       error
 	)
 	loadOpts := ModelOptions(backendConfig, appConfig, []model.Option{
 		model.WithModel(backendConfig.Model),
 	})
 	if backendConfig.Backend != "" {
 		loadOpts = append(loadOpts, model.WithBackendString(backendConfig.Backend))
 		tokenizer, err = loader.BackendLoader(loadOpts...)
 	} else {
 		tokenizer, err = loader.GreedyLoader(loadOpts...)
 	}
 	if err != nil {
 		return schema.TokenizeResponse{}, err
 	}
 	// Build the predict request from the backend config and attach the text to tokenize.
 	req := gRPCPredictOpts(backendConfig, loader.ModelPath)
 	req.Prompt = s
 	tokenized, err := tokenizer.TokenizeString(appConfig.Context, req)
 	if err != nil {
 		return schema.TokenizeResponse{}, err
 	}
 	return schema.TokenizeResponse{Tokens: tokenized.Tokens}, nil
 }
--- a/core/backend/transcript.go
+++ b/core/backend/transcript.go
@@ -14,13 +14,11 @@ import (
 func ModelTranscription(audio, language string, translate bool, ml *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (*schema.TranscriptionResult, error) {
-	opts := modelOpts(backendConfig, appConfig, []model.Option{
+	if backendConfig.Backend == "" {
-		model.WithBackendString(model.WhisperBackend),
+		backendConfig.Backend = model.WhisperBackend
-		model.WithModel(backendConfig.Model),
+	}
-		model.WithContext(appConfig.Context),
+
-		model.WithThreads(uint32(*backendConfig.Threads)),
+	opts := ModelOptions(backendConfig, appConfig, []model.Option{})
 		model.WithAssetDir(appConfig.AssetsDestination),
 	})
 	transcriptionModel, err := ml.BackendLoader(opts...)
 	if err != nil {
--- a/core/backend/tts.go
+++ b/core/backend/tts.go
@@ -28,14 +28,9 @@ func ModelTTS(
 		bb = model.PiperBackend
 	}
-	grpcOpts := gRPCModelOpts(backendConfig)
+	opts := ModelOptions(config.BackendConfig{}, appConfig, []model.Option{
 	opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
 		model.WithBackendString(bb),
 		model.WithModel(modelFile),
 		model.WithContext(appConfig.Context),
 		model.WithAssetDir(appConfig.AssetsDestination),
 		model.WithLoadGRPCLoadModelOpts(grpcOpts),
 	})
 	ttsModel, err := loader.BackendLoader(opts...)
 	if err != nil {
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -41,31 +41,35 @@ type RunCMD struct {
 	Threads     int  `env:"LOCALAI_THREADS,THREADS" short:"t" help:"Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested" group:"performance"`
 	ContextSize int  `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" default:"512" help:"Default context size for models" group:"performance"`
-	Address                string   `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
+	Address                            string   `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
-	CORS                   bool     `env:"LOCALAI_CORS,CORS" help:"" group:"api"`
+	CORS                               bool     `env:"LOCALAI_CORS,CORS" help:"" group:"api"`
-	CORSAllowOrigins       string   `env:"LOCALAI_CORS_ALLOW_ORIGINS,CORS_ALLOW_ORIGINS" group:"api"`
+	CORSAllowOrigins                   string   `env:"LOCALAI_CORS_ALLOW_ORIGINS,CORS_ALLOW_ORIGINS" group:"api"`
-	LibraryPath            string   `env:"LOCALAI_LIBRARY_PATH,LIBRARY_PATH" help:"Path to the library directory (for e.g. external libraries used by backends)" default:"/usr/share/local-ai/libs" group:"backends"`
+	LibraryPath                        string   `env:"LOCALAI_LIBRARY_PATH,LIBRARY_PATH" help:"Path to the library directory (for e.g. external libraries used by backends)" default:"/usr/share/local-ai/libs" group:"backends"`
-	CSRF                   bool     `env:"LOCALAI_CSRF" help:"Enables fiber CSRF middleware" group:"api"`
+	CSRF                               bool     `env:"LOCALAI_CSRF" help:"Enables fiber CSRF middleware" group:"api"`
-	UploadLimit            int      `env:"LOCALAI_UPLOAD_LIMIT,UPLOAD_LIMIT" default:"15" help:"Default upload-limit in MB" group:"api"`
+	UploadLimit                        int      `env:"LOCALAI_UPLOAD_LIMIT,UPLOAD_LIMIT" default:"15" help:"Default upload-limit in MB" group:"api"`
-	APIKeys                []string `env:"LOCALAI_API_KEY,API_KEY" help:"List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys" group:"api"`
+	APIKeys                            []string `env:"LOCALAI_API_KEY,API_KEY" help:"List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys" group:"api"`
-	DisableWebUI           bool     `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disable webui" group:"api"`
+	DisableWebUI                       bool     `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disable webui" group:"api"`
-	DisablePredownloadScan bool     `env:"LOCALAI_DISABLE_PREDOWNLOAD_SCAN" help:"If true, disables the best-effort security scanner before downloading any files." group:"hardening" default:"false"`
+	DisablePredownloadScan             bool     `env:"LOCALAI_DISABLE_PREDOWNLOAD_SCAN" help:"If true, disables the best-effort security scanner before downloading any files." group:"hardening" default:"false"`
-	OpaqueErrors           bool     `env:"LOCALAI_OPAQUE_ERRORS" default:"false" help:"If true, all error responses are replaced with blank 500 errors. This is intended only for hardening against information leaks and is normally not recommended." group:"hardening"`
+	OpaqueErrors                       bool     `env:"LOCALAI_OPAQUE_ERRORS" default:"false" help:"If true, all error responses are replaced with blank 500 errors. This is intended only for hardening against information leaks and is normally not recommended." group:"hardening"`
-	Peer2Peer              bool     `env:"LOCALAI_P2P,P2P" name:"p2p" default:"false" help:"Enable P2P mode" group:"p2p"`
+	UseSubtleKeyComparison             bool     `env:"LOCALAI_SUBTLE_KEY_COMPARISON" default:"false" help:"If true, API Key validation comparisons will be performed using constant-time comparisons rather than simple equality. This trades off performance on each request for resiliancy against timing attacks." group:"hardening"`
-	Peer2PeerDHTInterval   int      `env:"LOCALAI_P2P_DHT_INTERVAL,P2P_DHT_INTERVAL" default:"360" name:"p2p-dht-interval" help:"Interval for DHT refresh (used during token generation)" group:"p2p"`
+	DisableApiKeyRequirementForHttpGet bool     `env:"LOCALAI_DISABLE_API_KEY_REQUIREMENT_FOR_HTTP_GET" default:"false" help:"If true, a valid API key is not required to issue GET requests to portions of the web ui. This should only be enabled in secure testing environments" group:"hardening"`
-	Peer2PeerOTPInterval   int      `env:"LOCALAI_P2P_OTP_INTERVAL,P2P_OTP_INTERVAL" default:"9000" name:"p2p-otp-interval" help:"Interval for OTP refresh (used during token generation)" group:"p2p"`
+	HttpGetExemptedEndpoints           []string `env:"LOCALAI_HTTP_GET_EXEMPTED_ENDPOINTS" default:"^/$,^/browse/?$,^/talk/?$,^/p2p/?$,^/chat/?$,^/text2image/?$,^/tts/?$,^/static/.*$,^/swagger.*$" help:"If LOCALAI_DISABLE_API_KEY_REQUIREMENT_FOR_HTTP_GET is overriden to true, this is the list of endpoints to exempt. Only adjust this in case of a security incident or as a result of a personal security posture review" group:"hardening"`
-	Peer2PeerToken         string   `env:"LOCALAI_P2P_TOKEN,P2P_TOKEN,TOKEN" name:"p2ptoken" help:"Token for P2P mode (optional)" group:"p2p"`
+	Peer2Peer                          bool     `env:"LOCALAI_P2P,P2P" name:"p2p" default:"false" help:"Enable P2P mode" group:"p2p"`
-	Peer2PeerNetworkID     string   `env:"LOCALAI_P2P_NETWORK_ID,P2P_NETWORK_ID" help:"Network ID for P2P mode, can be set arbitrarly by the user for grouping a set of instances" group:"p2p"`
+	Peer2PeerDHTInterval               int      `env:"LOCALAI_P2P_DHT_INTERVAL,P2P_DHT_INTERVAL" default:"360" name:"p2p-dht-interval" help:"Interval for DHT refresh (used during token generation)" group:"p2p"`
-	ParallelRequests       bool     `env:"LOCALAI_PARALLEL_REQUESTS,PARALLEL_REQUESTS" help:"Enable backends to handle multiple requests in parallel if they support it (e.g.: llama.cpp or vllm)" group:"backends"`
+	Peer2PeerOTPInterval               int      `env:"LOCALAI_P2P_OTP_INTERVAL,P2P_OTP_INTERVAL" default:"9000" name:"p2p-otp-interval" help:"Interval for OTP refresh (used during token generation)" group:"p2p"`
-	SingleActiveBackend    bool     `env:"LOCALAI_SINGLE_ACTIVE_BACKEND,SINGLE_ACTIVE_BACKEND" help:"Allow only one backend to be run at a time" group:"backends"`
+	Peer2PeerToken                     string   `env:"LOCALAI_P2P_TOKEN,P2P_TOKEN,TOKEN" name:"p2ptoken" help:"Token for P2P mode (optional)" group:"p2p"`
-	PreloadBackendOnly     bool     `env:"LOCALAI_PRELOAD_BACKEND_ONLY,PRELOAD_BACKEND_ONLY" default:"false" help:"Do not launch the API services, only the preloaded models / backends are started (useful for multi-node setups)" group:"backends"`
+	Peer2PeerNetworkID                 string   `env:"LOCALAI_P2P_NETWORK_ID,P2P_NETWORK_ID" help:"Network ID for P2P mode, can be set arbitrarly by the user for grouping a set of instances" group:"p2p"`
-	ExternalGRPCBackends   []string `env:"LOCALAI_EXTERNAL_GRPC_BACKENDS,EXTERNAL_GRPC_BACKENDS" help:"A list of external grpc backends" group:"backends"`
+	ParallelRequests                   bool     `env:"LOCALAI_PARALLEL_REQUESTS,PARALLEL_REQUESTS" help:"Enable backends to handle multiple requests in parallel if they support it (e.g.: llama.cpp or vllm)" group:"backends"`
-	EnableWatchdogIdle     bool     `env:"LOCALAI_WATCHDOG_IDLE,WATCHDOG_IDLE" default:"false" help:"Enable watchdog for stopping backends that are idle longer than the watchdog-idle-timeout" group:"backends"`
+	SingleActiveBackend                bool     `env:"LOCALAI_SINGLE_ACTIVE_BACKEND,SINGLE_ACTIVE_BACKEND" help:"Allow only one backend to be run at a time" group:"backends"`
-	WatchdogIdleTimeout    string   `env:"LOCALAI_WATCHDOG_IDLE_TIMEOUT,WATCHDOG_IDLE_TIMEOUT" default:"15m" help:"Threshold beyond which an idle backend should be stopped" group:"backends"`
+	PreloadBackendOnly                 bool     `env:"LOCALAI_PRELOAD_BACKEND_ONLY,PRELOAD_BACKEND_ONLY" default:"false" help:"Do not launch the API services, only the preloaded models / backends are started (useful for multi-node setups)" group:"backends"`
-	EnableWatchdogBusy     bool     `env:"LOCALAI_WATCHDOG_BUSY,WATCHDOG_BUSY" default:"false" help:"Enable watchdog for stopping backends that are busy longer than the watchdog-busy-timeout" group:"backends"`
+	ExternalGRPCBackends               []string `env:"LOCALAI_EXTERNAL_GRPC_BACKENDS,EXTERNAL_GRPC_BACKENDS" help:"A list of external grpc backends" group:"backends"`
-	WatchdogBusyTimeout    string   `env:"LOCALAI_WATCHDOG_BUSY_TIMEOUT,WATCHDOG_BUSY_TIMEOUT" default:"5m" help:"Threshold beyond which a busy backend should be stopped" group:"backends"`
+	EnableWatchdogIdle                 bool     `env:"LOCALAI_WATCHDOG_IDLE,WATCHDOG_IDLE" default:"false" help:"Enable watchdog for stopping backends that are idle longer than the watchdog-idle-timeout" group:"backends"`
-	Federated              bool     `env:"LOCALAI_FEDERATED,FEDERATED" help:"Enable federated instance" group:"federated"`
+	WatchdogIdleTimeout                string   `env:"LOCALAI_WATCHDOG_IDLE_TIMEOUT,WATCHDOG_IDLE_TIMEOUT" default:"15m" help:"Threshold beyond which an idle backend should be stopped" group:"backends"`
-	DisableGalleryEndpoint bool     `env:"LOCALAI_DISABLE_GALLERY_ENDPOINT,DISABLE_GALLERY_ENDPOINT" help:"Disable the gallery endpoints" group:"api"`
+	EnableWatchdogBusy                 bool     `env:"LOCALAI_WATCHDOG_BUSY,WATCHDOG_BUSY" default:"false" help:"Enable watchdog for stopping backends that are busy longer than the watchdog-busy-timeout" group:"backends"`
 	WatchdogBusyTimeout                string   `env:"LOCALAI_WATCHDOG_BUSY_TIMEOUT,WATCHDOG_BUSY_TIMEOUT" default:"5m" help:"Threshold beyond which a busy backend should be stopped" group:"backends"`
 	Federated                          bool     `env:"LOCALAI_FEDERATED,FEDERATED" help:"Enable federated instance" group:"federated"`
 	DisableGalleryEndpoint             bool     `env:"LOCALAI_DISABLE_GALLERY_ENDPOINT,DISABLE_GALLERY_ENDPOINT" help:"Disable the gallery endpoints" group:"api"`
 	LoadToMemory                       []string `env:"LOCALAI_LOAD_TO_MEMORY,LOAD_TO_MEMORY" help:"A list of models to load into memory at startup" group:"models"`
 }
 func (r *RunCMD) Run(ctx *cliContext.Context) error {
@@ -97,7 +101,11 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 		config.WithModelsURL(append(r.Models, r.ModelArgs...)...),
 		config.WithOpaqueErrors(r.OpaqueErrors),
 		config.WithEnforcedPredownloadScans(!r.DisablePredownloadScan),
 		config.WithSubtleKeyComparison(r.UseSubtleKeyComparison),
 		config.WithDisableApiKeyRequirementForHttpGet(r.DisableApiKeyRequirementForHttpGet),
 		config.WithHttpGetExemptedEndpoints(r.HttpGetExemptedEndpoints),
 		config.WithP2PNetworkID(r.Peer2PeerNetworkID),
 		config.WithLoadToMemory(r.LoadToMemory),
 	}
 	token := ""
--- a/core/cli/soundgeneration.go
+++ b/core/cli/soundgeneration.go
@@ -85,13 +85,14 @@ func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {
 	options := config.BackendConfig{}
 	options.SetDefaults()
 	options.Backend = t.Backend
 	var inputFile *string
 	if t.InputFile != "" {
 		inputFile = &t.InputFile
 	}
-	filePath, _, err := backend.SoundGeneration(t.Backend, t.Model, text,
+	filePath, _, err := backend.SoundGeneration(t.Model, text,
 		parseToFloat32Ptr(t.Duration), parseToFloat32Ptr(t.Temperature), &t.DoSample,
 		inputFile, parseToInt32Ptr(t.InputFileSampleDivisor), ml, opts, options)
--- a/core/cli/util.go
+++ b/core/cli/util.go
@@ -15,8 +15,9 @@ import (
 )
 type UtilCMD struct {
-	GGUFInfo GGUFInfoCMD `cmd:"" name:"gguf-info" help:"Get information about a GGUF file"`
+	GGUFInfo         GGUFInfoCMD         `cmd:"" name:"gguf-info" help:"Get information about a GGUF file"`
-	HFScan   HFScanCMD   `cmd:"" name:"hf-scan" help:"Checks installed models for known security issues. WARNING: this is a best-effort feature and may not catch everything!"`
+	HFScan           HFScanCMD           `cmd:"" name:"hf-scan" help:"Checks installed models for known security issues. WARNING: this is a best-effort feature and may not catch everything!"`
 	UsecaseHeuristic UsecaseHeuristicCMD `cmd:"" name:"usecase-heuristic" help:"Checks a specific model config and prints what usecase LocalAI will offer for it."`
 }
 type GGUFInfoCMD struct {
@@ -30,6 +31,11 @@ type HFScanCMD struct {
 	ToScan     []string `arg:""`
 }
 // UsecaseHeuristicCMD holds the arguments of the usecase heuristic utility command.
 type UsecaseHeuristicCMD struct {
 	// ConfigName is handed to both LoadBackendConfig and GetBackendConfig by Run.
 	// NOTE(review): the `name` tag below carries what reads like help text rather
 	// than a flag name — confirm whether a `help:` tag was intended.
 	ConfigName string `name:"The config file to check"`
 	// ModelsPath is the directory the backend config loader is created with.
 	ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
 }
 func (u *GGUFInfoCMD) Run(ctx *cliContext.Context) error {
 	if u.Args == nil || len(u.Args) == 0 {
 		return fmt.Errorf("no GGUF file provided")
@@ -99,3 +105,31 @@ func (hfscmd *HFScanCMD) Run(ctx *cliContext.Context) error {
 		return nil
 	}
 }
 // Run loads the backend config identified by ConfigName using a loader rooted at
 // ModelsPath, then logs one line per usecase flag that HasUsecases reports for it,
 // followed by a "---" terminator line.
 //
 // It returns an error when ConfigName or ModelsPath is empty, when the config
 // cannot be loaded, or when the loader does not know the config after loading.
 func (uhcmd *UsecaseHeuristicCMD) Run(ctx *cliContext.Context) error {
 	if len(uhcmd.ConfigName) == 0 {
 		log.Error().Msg("ConfigName is a required parameter")
 		return fmt.Errorf("config name is a required parameter")
 	}
 	if len(uhcmd.ModelsPath) == 0 {
 		log.Error().Msg("ModelsPath is a required parameter")
 		return fmt.Errorf("model path is a required parameter")
 	}
 	bcl := config.NewBackendConfigLoader(uhcmd.ModelsPath)
 	err := bcl.LoadBackendConfig(uhcmd.ConfigName)
 	if err != nil {
 		log.Error().Err(err).Str("ConfigName", uhcmd.ConfigName).Msg("error while loading backend")
 		return err
 	}
 	bc, exists := bcl.GetBackendConfig(uhcmd.ConfigName)
 	if !exists {
 		log.Error().Str("ConfigName", uhcmd.ConfigName).Msg("ConfigName not found")
 		// Evaluating heuristics against a config that was not found would report
 		// usecases for the wrong (empty) config, so stop here.
 		return fmt.Errorf("config %q not found", uhcmd.ConfigName)
 	}
 	for name, uc := range config.GetAllBackendConfigUsecases() {
 		if bc.HasUsecases(uc) {
 			// A zerolog event is only written once Msg/Send is called.
 			log.Info().Str("Usecase", name).Send()
 		}
 	}
 	log.Info().Msg("---")
 	return nil
 }
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"embed"
 	"encoding/json"
 	"regexp"
 	"time"
 	"github.com/mudler/LocalAI/pkg/xsysinfo"
@@ -16,7 +17,6 @@ type ApplicationConfig struct {
 	ModelPath                           string
 	LibPath                             string
 	UploadLimitMB, Threads, ContextSize int
 	DisableWebUI                        bool
 	F16                                 bool
 	Debug                               bool
 	ImageDir                            string
@@ -31,11 +31,18 @@ type ApplicationConfig struct {
 	PreloadModelsFromPath               string
 	CORSAllowOrigins                    string
 	ApiKeys                             []string
 	EnforcePredownloadScans             bool
 	OpaqueErrors                        bool
 	P2PToken                            string
 	P2PNetworkID                        string
 	DisableWebUI                       bool
 	EnforcePredownloadScans            bool
 	OpaqueErrors                       bool
 	UseSubtleKeyComparison             bool
 	DisableApiKeyRequirementForHttpGet bool
 	HttpGetExemptedEndpoints           []*regexp.Regexp
 	DisableGalleryEndpoint             bool
 	LoadToMemory                       []string
 	ModelLibraryURL string
 	Galleries []Gallery
@@ -57,8 +64,6 @@ type ApplicationConfig struct {
 	ModelsURL []string
 	WatchDogBusyTimeout, WatchDogIdleTimeout time.Duration
 	DisableGalleryEndpoint bool
 }
 type AppOption func(*ApplicationConfig)
@@ -327,6 +332,38 @@ func WithOpaqueErrors(opaque bool) AppOption {
 	}
 }
 // WithLoadToMemory returns an AppOption that stores models in
 // ApplicationConfig.LoadToMemory.
 func WithLoadToMemory(models []string) AppOption {
 	apply := func(cfg *ApplicationConfig) {
 		cfg.LoadToMemory = models
 	}
 	return apply
 }
 // WithSubtleKeyComparison returns an AppOption that copies subtle into
 // ApplicationConfig.UseSubtleKeyComparison.
 func WithSubtleKeyComparison(subtle bool) AppOption {
 	apply := func(cfg *ApplicationConfig) {
 		cfg.UseSubtleKeyComparison = subtle
 	}
 	return apply
 }
 // WithDisableApiKeyRequirementForHttpGet returns an AppOption that copies its
 // argument into ApplicationConfig.DisableApiKeyRequirementForHttpGet.
 // Despite the parameter's name, true stores true in the Disable... field.
 func WithDisableApiKeyRequirementForHttpGet(required bool) AppOption {
 	apply := func(cfg *ApplicationConfig) {
 		cfg.DisableApiKeyRequirementForHttpGet = required
 	}
 	return apply
 }
 // WithHttpGetExemptedEndpoints returns an AppOption that compiles each entry of
 // endpoints as a regular expression and stores the successfully compiled ones in
 // ApplicationConfig.HttpGetExemptedEndpoints. Entries that fail to compile are
 // logged as warnings and left out; the stored slice is never nil.
 func WithHttpGetExemptedEndpoints(endpoints []string) AppOption {
 	return func(o *ApplicationConfig) {
 		compiled := []*regexp.Regexp{}
 		for _, pattern := range endpoints {
 			rx, compileErr := regexp.Compile(pattern)
 			if compileErr != nil || rx == nil {
 				log.Warn().Err(compileErr).Str("regex", pattern).Msg("Error while compiling HTTP Get Exemption regex, skipping this entry.")
 				continue
 			}
 			compiled = append(compiled, rx)
 		}
 		o.HttpGetExemptedEndpoints = compiled
 	}
 }
 // ToConfigLoaderOptions returns a slice of ConfigLoader Option.
 // Some options defined at the application level are going to be passed as defaults for
 // all the configuration for the models.
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -3,11 +3,13 @@ package config
 import (
 	"os"
 	"regexp"
 	"slices"
 	"strings"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/downloader"
 	"github.com/mudler/LocalAI/pkg/functions"
 	"gopkg.in/yaml.v3"
 )
 const (
@@ -27,13 +29,15 @@ type BackendConfig struct {
 	schema.PredictionOptions `yaml:"parameters"`
 	Name                     string `yaml:"name"`
-	F16            *bool             `yaml:"f16"`
+	F16                 *bool                  `yaml:"f16"`
-	Threads        *int              `yaml:"threads"`
+	Threads             *int                   `yaml:"threads"`
-	Debug          *bool             `yaml:"debug"`
+	Debug               *bool                  `yaml:"debug"`
-	Roles          map[string]string `yaml:"roles"`
+	Roles               map[string]string      `yaml:"roles"`
-	Embeddings     *bool             `yaml:"embeddings"`
+	Embeddings          *bool                  `yaml:"embeddings"`
-	Backend        string            `yaml:"backend"`
+	Backend             string                 `yaml:"backend"`
-	TemplateConfig TemplateConfig    `yaml:"template"`
+	TemplateConfig      TemplateConfig         `yaml:"template"`
 	KnownUsecaseStrings []string               `yaml:"known_usecases"`
 	KnownUsecases       *BackendConfigUsecases `yaml:"-"`
 	PromptStrings, InputStrings                []string               `yaml:"-"`
 	InputToken                                 [][]int                `yaml:"-"`
@@ -192,6 +196,21 @@ type TemplateConfig struct {
 	// JoinChatMessagesByCharacter is a string that will be used to join chat messages together.
 	// It defaults to \n
 	JoinChatMessagesByCharacter *string `yaml:"join_chat_messages_by_character"`
 	Video string `yaml:"video"`
 	Image string `yaml:"image"`
 	Audio string `yaml:"audio"`
 }
 // UnmarshalYAML decodes value into the receiver and then derives KnownUsecases
 // from the decoded KnownUsecaseStrings via GetUsecasesFromYAML.
 func (c *BackendConfig) UnmarshalYAML(value *yaml.Node) error {
 	// A locally defined type has no methods of its own, so decoding into it
 	// does not call back into this UnmarshalYAML.
 	type plainConfig BackendConfig
 	decoded := plainConfig{}
 	err := value.Decode(&decoded)
 	if err != nil {
 		return err
 	}
 	*c = BackendConfig(decoded)
 	c.KnownUsecases = GetUsecasesFromYAML(c.KnownUsecaseStrings)
 	return nil
 }
 func (c *BackendConfig) SetFunctionCallString(s string) {
@@ -410,3 +429,121 @@ func (c *BackendConfig) Validate() bool {
 // HasTemplate reports whether at least one of the Completion, Edit, Chat or
 // ChatMessage templates is a non-empty string.
 func (c *BackendConfig) HasTemplate() bool {
 	tmpl := &c.TemplateConfig
 	for _, body := range []string{tmpl.Completion, tmpl.Edit, tmpl.Chat, tmpl.ChatMessage} {
 		if body != "" {
 			return true
 		}
 	}
 	return false
 }
 // BackendConfigUsecases is a bit set in which every usecase occupies one bit.
 // Values are combined with | and tested with (u & flag) == flag.
 type BackendConfigUsecases int
 const (
 	// FLAG_ANY has no bits set, so it is contained in every other value.
 	FLAG_ANY              BackendConfigUsecases = 0b000000000
 	FLAG_CHAT             BackendConfigUsecases = 0b000000001
 	FLAG_COMPLETION       BackendConfigUsecases = 0b000000010
 	FLAG_EDIT             BackendConfigUsecases = 0b000000100
 	FLAG_EMBEDDINGS       BackendConfigUsecases = 0b000001000
 	FLAG_RERANK           BackendConfigUsecases = 0b000010000
 	FLAG_IMAGE            BackendConfigUsecases = 0b000100000
 	FLAG_TRANSCRIPT       BackendConfigUsecases = 0b001000000
 	FLAG_TTS              BackendConfigUsecases = 0b010000000
 	FLAG_SOUND_GENERATION BackendConfigUsecases = 0b100000000
 	// Common Subsets
 	// FLAG_LLM is the union of the chat, completion and edit bits. The bits are
 	// disjoint, so they must be OR-ed together; AND-ing them yields 0 (FLAG_ANY).
 	FLAG_LLM BackendConfigUsecases = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT
 )
 // GetAllBackendConfigUsecases returns a freshly built map from each flag's
 // identifier (for example "FLAG_CHAT") to its BackendConfigUsecases value.
 func GetAllBackendConfigUsecases() map[string]BackendConfigUsecases {
 	byName := map[string]BackendConfigUsecases{
 		"FLAG_ANY":              FLAG_ANY,
 		"FLAG_CHAT":             FLAG_CHAT,
 		"FLAG_COMPLETION":       FLAG_COMPLETION,
 		"FLAG_EDIT":             FLAG_EDIT,
 		"FLAG_EMBEDDINGS":       FLAG_EMBEDDINGS,
 		"FLAG_RERANK":           FLAG_RERANK,
 		"FLAG_IMAGE":            FLAG_IMAGE,
 		"FLAG_TRANSCRIPT":       FLAG_TRANSCRIPT,
 		"FLAG_TTS":              FLAG_TTS,
 		"FLAG_SOUND_GENERATION": FLAG_SOUND_GENERATION,
 		"FLAG_LLM":              FLAG_LLM,
 	}
 	return byName
 }
 // GetUsecasesFromYAML turns a list of usecase names into a combined flag value.
 // Each name is upper-cased and prefixed with "FLAG_" before being looked up in
 // GetAllBackendConfigUsecases; names with no match are ignored.
 // It returns nil for an empty input, otherwise a pointer to the OR of all
 // matched flags (which is FLAG_ANY when nothing matched).
 func GetUsecasesFromYAML(input []string) *BackendConfigUsecases {
 	if len(input) == 0 {
 		return nil
 	}
 	known := GetAllBackendConfigUsecases()
 	combined := FLAG_ANY
 	for _, raw := range input {
 		if flag, ok := known["FLAG_"+strings.ToUpper(raw)]; ok {
 			combined |= flag
 		}
 	}
 	return &combined
 }
 // HasUsecases reports whether the config can serve every usecase bit set in u.
 // When KnownUsecases is set and already contains all of u, the answer is true;
 // in every other case the decision is delegated to GuessUsecases.
 func (c *BackendConfig) HasUsecases(u BackendConfigUsecases) bool {
 	if known := c.KnownUsecases; known != nil && (u&*known) == u {
 		return true
 	}
 	return c.GuessUsecases(u)
 }
 // GuessUsecases is a heuristic. The backend may not be loaded yet and the config
 // may not state what it is good at, so only fields of the config are inspected.
 // It returns false as soon as a usecase bit set in u fails its check and true
 // when every requested bit passes; a u with no bits set passes trivially.
 // Ideally every check would look at config properties (as the template checks
 // do) rather than at backend names, which need updating for each new backend;
 // for some services the name is currently the only signal available.
 func (c *BackendConfig) GuessUsecases(u BackendConfigUsecases) bool {
 	wants := func(flag BackendConfigUsecases) bool { return (u & flag) == flag }
 	tmpl := &c.TemplateConfig
 	// Template-driven usecases.
 	if wants(FLAG_CHAT) && tmpl.Chat == "" && tmpl.ChatMessage == "" {
 		return false
 	}
 	if wants(FLAG_COMPLETION) && tmpl.Completion == "" {
 		return false
 	}
 	if wants(FLAG_EDIT) && tmpl.Edit == "" {
 		return false
 	}
 	if wants(FLAG_EMBEDDINGS) && (c.Embeddings == nil || !*c.Embeddings) {
 		return false
 	}
 	// Backend-name-driven usecases.
 	if wants(FLAG_IMAGE) {
 		supported := slices.Contains([]string{"diffusers", "tinydream", "stablediffusion"}, c.Backend)
 		if !supported {
 			return false
 		}
 		// diffusers additionally needs a pipeline type.
 		if c.Backend == "diffusers" && c.Diffusers.PipelineType == "" {
 			return false
 		}
 	}
 	if wants(FLAG_RERANK) && c.Backend != "rerankers" {
 		return false
 	}
 	if wants(FLAG_TRANSCRIPT) && c.Backend != "whisper" {
 		return false
 	}
 	if wants(FLAG_TTS) && !slices.Contains([]string{"piper", "transformers-musicgen", "parler-tts"}, c.Backend) {
 		return false
 	}
 	if wants(FLAG_SOUND_GENERATION) && c.Backend != "transformers-musicgen" {
 		return false
 	}
 	return true
 }
--- a/core/config/backend_config_filter.go
+++ b/core/config/backend_config_filter.go
@@ -0,0 +1,35 @@
 package config
 import "regexp"
 // BackendConfigFilterFn decides whether an entry, given as a name plus an
 // optional config pointer, should be kept (true) or dropped (false).
 type BackendConfigFilterFn func(string, *BackendConfig) bool
 // NoFilterFn is the pass-through filter: it ignores both arguments and always
 // returns true.
 func NoFilterFn(_ string, _ *BackendConfig) bool {
 	return true
 }
 // BuildNameFilterFn builds a filter from a regular expression.
 // An empty filter yields NoFilterFn. Otherwise filter is compiled, and the
 // returned function matches it against config.Name when a config is supplied
 // or against the bare name argument when config is nil.
 // A compilation failure is returned as the error with a nil filter.
 func BuildNameFilterFn(filter string) (BackendConfigFilterFn, error) {
 	if filter == "" {
 		return NoFilterFn, nil
 	}
 	rxp, err := regexp.Compile(filter)
 	if err != nil {
 		return nil, err
 	}
 	matchName := func(name string, config *BackendConfig) bool {
 		candidate := name
 		if config != nil {
 			candidate = config.Name
 		}
 		return rxp.MatchString(candidate)
 	}
 	return matchName, nil
 }
 // BuildUsecaseFilterFn builds a filter that keeps configs for which
 // HasUsecases(usecases) is true. FLAG_ANY short-circuits to NoFilterFn.
 // Entries that come without a config are rejected by the returned filter.
 func BuildUsecaseFilterFn(usecases BackendConfigUsecases) BackendConfigFilterFn {
 	if usecases == FLAG_ANY {
 		return NoFilterFn
 	}
 	byUsecase := func(name string, config *BackendConfig) bool {
 		if config != nil {
 			return config.HasUsecases(usecases)
 		}
 		// TODO: Potentially make this a param, for now, no known usecase to include
 		return false
 	}
 	return byUsecase
 }
--- a/core/config/backend_config_loader.go
+++ b/core/config/backend_config_loader.go
@@ -201,6 +201,26 @@ func (bcl *BackendConfigLoader) GetAllBackendConfigs() []BackendConfig {
 	return res
 }
 // GetBackendConfigsByFilter returns copies of every stored config that filter
 // accepts, holding the loader's lock for the duration of the scan.
 // A nil filter is treated as NoFilterFn. The result follows map iteration
 // order and is nil when nothing is accepted.
 func (bcl *BackendConfigLoader) GetBackendConfigsByFilter(filter BackendConfigFilterFn) []BackendConfig {
 	bcl.Lock()
 	defer bcl.Unlock()
 	accept := filter
 	if accept == nil {
 		accept = NoFilterFn
 	}
 	var matched []BackendConfig
 	for name, cfg := range bcl.configs {
 		if !accept(name, &cfg) {
 			continue
 		}
 		matched = append(matched, cfg)
 	}
 	// TODO: I don't think this one needs to Sort on name... but we'll see what breaks.
 	return matched
 }
 func (bcl *BackendConfigLoader) RemoveBackendConfig(m string) {
 	bcl.Lock()
 	defer bcl.Unlock()
--- a/core/config/backend_config_test.go
+++ b/core/config/backend_config_test.go
@@ -19,12 +19,17 @@ var _ = Describe("Test cases for config related functions", func() {
 				`backend: "../foo-bar"
 name: "foo"
 parameters:
-  model: "foo-bar"`)
+  model: "foo-bar"
 known_usecases:
 - chat
 - COMPLETION
 `)
 			Expect(err).ToNot(HaveOccurred())
 			config, err := readBackendConfigFromFile(tmp.Name())
 			Expect(err).To(BeNil())
 			Expect(config).ToNot(BeNil())
 			Expect(config.Validate()).To(BeFalse())
 			Expect(config.KnownUsecases).ToNot(BeNil())
 		})
 		It("Test Validate", func() {
 			tmp, err := os.CreateTemp("", "config.yaml")
@@ -61,4 +66,99 @@ parameters:
 			Expect(config.Validate()).To(BeTrue())
 		})
 	})
 	It("Properly handles backend usecase matching", func() {
 		a := BackendConfig{
 			Name: "a",
 		}
 		Expect(a.HasUsecases(FLAG_ANY)).To(BeTrue()) // FLAG_ANY just means the config _exists_ essentially.
 		b := BackendConfig{
 			Name:    "b",
 			Backend: "stablediffusion",
 		}
 		Expect(b.HasUsecases(FLAG_ANY)).To(BeTrue())
 		Expect(b.HasUsecases(FLAG_IMAGE)).To(BeTrue())
 		Expect(b.HasUsecases(FLAG_CHAT)).To(BeFalse())
 		c := BackendConfig{
 			Name:    "c",
 			Backend: "llama-cpp",
 			TemplateConfig: TemplateConfig{
 				Chat: "chat",
 			},
 		}
 		Expect(c.HasUsecases(FLAG_ANY)).To(BeTrue())
 		Expect(c.HasUsecases(FLAG_IMAGE)).To(BeFalse())
 		Expect(c.HasUsecases(FLAG_COMPLETION)).To(BeFalse())
 		Expect(c.HasUsecases(FLAG_CHAT)).To(BeTrue())
 		d := BackendConfig{
 			Name:    "d",
 			Backend: "llama-cpp",
 			TemplateConfig: TemplateConfig{
 				Chat:       "chat",
 				Completion: "completion",
 			},
 		}
 		Expect(d.HasUsecases(FLAG_ANY)).To(BeTrue())
 		Expect(d.HasUsecases(FLAG_IMAGE)).To(BeFalse())
 		Expect(d.HasUsecases(FLAG_COMPLETION)).To(BeTrue())
 		Expect(d.HasUsecases(FLAG_CHAT)).To(BeTrue())
 		trueValue := true
 		e := BackendConfig{
 			Name:    "e",
 			Backend: "llama-cpp",
 			TemplateConfig: TemplateConfig{
 				Completion: "completion",
 			},
 			Embeddings: &trueValue,
 		}
 		Expect(e.HasUsecases(FLAG_ANY)).To(BeTrue())
 		Expect(e.HasUsecases(FLAG_IMAGE)).To(BeFalse())
 		Expect(e.HasUsecases(FLAG_COMPLETION)).To(BeTrue())
 		Expect(e.HasUsecases(FLAG_CHAT)).To(BeFalse())
 		Expect(e.HasUsecases(FLAG_EMBEDDINGS)).To(BeTrue())
 		f := BackendConfig{
 			Name:    "f",
 			Backend: "piper",
 		}
 		Expect(f.HasUsecases(FLAG_ANY)).To(BeTrue())
 		Expect(f.HasUsecases(FLAG_TTS)).To(BeTrue())
 		Expect(f.HasUsecases(FLAG_CHAT)).To(BeFalse())
 		g := BackendConfig{
 			Name:    "g",
 			Backend: "whisper",
 		}
 		Expect(g.HasUsecases(FLAG_ANY)).To(BeTrue())
 		Expect(g.HasUsecases(FLAG_TRANSCRIPT)).To(BeTrue())
 		Expect(g.HasUsecases(FLAG_TTS)).To(BeFalse())
 		h := BackendConfig{
 			Name:    "h",
 			Backend: "transformers-musicgen",
 		}
 		Expect(h.HasUsecases(FLAG_ANY)).To(BeTrue())
 		Expect(h.HasUsecases(FLAG_TRANSCRIPT)).To(BeFalse())
 		Expect(h.HasUsecases(FLAG_TTS)).To(BeTrue())
 		Expect(h.HasUsecases(FLAG_SOUND_GENERATION)).To(BeTrue())
 		knownUsecases := FLAG_CHAT | FLAG_COMPLETION
 		i := BackendConfig{
 			Name:    "i",
 			Backend: "whisper",
 			// Earlier test checks parsing, this just needs to set final values
 			KnownUsecases: &knownUsecases,
 		}
 		Expect(i.HasUsecases(FLAG_ANY)).To(BeTrue())
 		Expect(i.HasUsecases(FLAG_TRANSCRIPT)).To(BeTrue())
 		Expect(i.HasUsecases(FLAG_TTS)).To(BeFalse())
 		Expect(i.HasUsecases(FLAG_COMPLETION)).To(BeTrue())
 		Expect(i.HasUsecases(FLAG_CHAT)).To(BeTrue())
 	})
 })
--- a/core/gallery/gallery.go
+++ b/core/gallery/gallery.go
@@ -132,7 +132,7 @@ func AvailableGalleryModels(galleries []config.Gallery, basePath string) ([]*Gal
 func findGalleryURLFromReferenceURL(url string, basePath string) (string, error) {
 	var refFile string
 	uri := downloader.URI(url)
-	err := uri.DownloadAndUnmarshal(basePath, func(url string, d []byte) error {
+	err := uri.DownloadWithCallback(basePath, func(url string, d []byte) error {
 		refFile = string(d)
 		if len(refFile) == 0 {
 			return fmt.Errorf("invalid reference file at url %s: %s", url, d)
@@ -156,7 +156,7 @@ func getGalleryModels(gallery config.Gallery, basePath string) ([]*GalleryModel,
 	}
 	uri := downloader.URI(gallery.URL)
-	err := uri.DownloadAndUnmarshal(basePath, func(url string, d []byte) error {
+	err := uri.DownloadWithCallback(basePath, func(url string, d []byte) error {
 		return yaml.Unmarshal(d, &models)
 	})
 	if err != nil {
--- a/core/gallery/models.go
+++ b/core/gallery/models.go
@@ -69,7 +69,7 @@ type PromptTemplate struct {
 func GetGalleryConfigFromURL(url string, basePath string) (Config, error) {
 	var config Config
 	uri := downloader.URI(url)
-	err := uri.DownloadAndUnmarshal(basePath, func(url string, d []byte) error {
+	err := uri.DownloadWithCallback(basePath, func(url string, d []byte) error {
 		return yaml.Unmarshal(d, &config)
 	})
 	if err != nil {
--- a/core/http/app.go
+++ b/core/http/app.go
@@ -3,13 +3,15 @@ package http
 import (
 	"embed"
 	"errors"
 	"fmt"
 	"net/http"
 	"strings"
 	"github.com/dave-gray101/v2keyauth"
 	"github.com/mudler/LocalAI/pkg/utils"
 	"github.com/mudler/LocalAI/core/http/endpoints/localai"
 	"github.com/mudler/LocalAI/core/http/endpoints/openai"
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/core/http/routes"
 	"github.com/mudler/LocalAI/core/config"
@@ -29,24 +31,6 @@ import (
 	"github.com/rs/zerolog/log"
 )
 // readAuthHeader returns the value to authenticate with. It starts from the
 // Authorization header and replaces it with "Bearer <key>" when a vendor key
 // header is non-empty; when both vendor headers are set, the later one wins.
 func readAuthHeader(c *fiber.Ctx) string {
 	header := c.Get("Authorization")
 	// Checked in order: xi-api-key (elevenlabs), then x-api-key (anthropic).
 	for _, vendorHeader := range []string{"xi-api-key", "x-api-key"} {
 		if key := c.Get(vendorHeader); key != "" {
 			header = "Bearer " + key
 		}
 	}
 	return header
 }
 // Embed a directory
 //
 //go:embed static/*
@@ -137,37 +121,17 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi
 		})
 	}
-	// Auth middleware checking if API key is valid. If no API key is set, no auth is required.
+ // Health Checks should always be exempt from auth, so register these first
-	auth := func(c *fiber.Ctx) error {
+	routes.HealthRoutes(app)
 		if len(appConfig.ApiKeys) == 0 {
 			return c.Next()
 		}
-		if len(appConfig.ApiKeys) == 0 {
+	kaConfig, err := middleware.GetKeyAuthConfig(appConfig)
-			return c.Next()
+	if err != nil || kaConfig == nil {
-		}
+		return nil, fmt.Errorf("failed to create key auth config: %w", err)
 		authHeader := readAuthHeader(c)
 		if authHeader == "" {
 			return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Authorization header missing"})
 		}
 		// If it's a bearer token
 		authHeaderParts := strings.Split(authHeader, " ")
 		if len(authHeaderParts) != 2 || authHeaderParts[0] != "Bearer" {
 			return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid Authorization header format"})
 		}
 		apiKey := authHeaderParts[1]
 		for _, key := range appConfig.ApiKeys {
 			if apiKey == key {
 				return c.Next()
 			}
 		}
 		return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid API key"})
 	}
 	// Auth is applied to _all_ endpoints. No exceptions. Filtering out endpoints to bypass is the role of the Filter property of the KeyAuth Configuration
 	app.Use(v2keyauth.New(*kaConfig))
 	if appConfig.CORS {
 		var c func(ctx *fiber.Ctx) error
 		if appConfig.CORSAllowOrigins == "" {
@@ -192,13 +156,13 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi
 	galleryService := services.NewGalleryService(appConfig)
 	galleryService.Start(appConfig.Context, cl)
-	routes.RegisterElevenLabsRoutes(app, cl, ml, appConfig, auth)
+	routes.RegisterElevenLabsRoutes(app, cl, ml, appConfig)
-	routes.RegisterLocalAIRoutes(app, cl, ml, appConfig, galleryService, auth)
+	routes.RegisterLocalAIRoutes(app, cl, ml, appConfig, galleryService)
-	routes.RegisterOpenAIRoutes(app, cl, ml, appConfig, auth)
+	routes.RegisterOpenAIRoutes(app, cl, ml, appConfig)
 	if !appConfig.DisableWebUI {
-		routes.RegisterUIRoutes(app, cl, ml, appConfig, galleryService, auth)
+		routes.RegisterUIRoutes(app, cl, ml, appConfig, galleryService)
 	}
-	routes.RegisterJINARoutes(app, cl, ml, appConfig, auth)
+	routes.RegisterJINARoutes(app, cl, ml, appConfig)
 	httpFS := http.FS(embedDirStatic)
--- a/core/http/app_test.go
+++ b/core/http/app_test.go
@@ -31,6 +31,9 @@ import (
 	"github.com/sashabaranov/go-openai/jsonschema"
 )
 const apiKey = "joshua"
 const bearerKey = "Bearer " + apiKey
 const testPrompt = `### System:
 You are an AI assistant that follows instruction extremely well. Help as much as you can.
@@ -50,11 +53,19 @@ type modelApplyRequest struct {
 func getModelStatus(url string) (response map[string]interface{}) {
 	// Create the HTTP request
-	resp, err := http.Get(url)
+	req, err := http.NewRequest("GET", url, nil)
 	req.Header.Set("Content-Type", "application/json")
 	req.Header.Set("Authorization", bearerKey)
 	if err != nil {
 		fmt.Println("Error creating request:", err)
 		return
 	}
 	client := &http.Client{}
 	resp, err := client.Do(req)
 	if err != nil {
 		fmt.Println("Error sending request:", err)
 		return
 	}
 	defer resp.Body.Close()
 	body, err := io.ReadAll(resp.Body)
@@ -72,14 +83,15 @@ func getModelStatus(url string) (response map[string]interface{}) {
 	return
 }
-func getModels(url string) (response []gallery.GalleryModel) {
+func getModels(url string) ([]gallery.GalleryModel, error) {
 	response := []gallery.GalleryModel{}
 	uri := downloader.URI(url)
 	// TODO: No tests currently seem to exercise file:// urls. Fix?
-	uri.DownloadAndUnmarshal("", func(url string, i []byte) error {
+	err := uri.DownloadWithAuthorizationAndCallback("", bearerKey, func(url string, i []byte) error {
 		// Unmarshal YAML data into a struct
 		return json.Unmarshal(i, &response)
 	})
-	return
+	return response, err
 }
 func postModelApplyRequest(url string, request modelApplyRequest) (response map[string]interface{}) {
@@ -101,6 +113,7 @@ func postModelApplyRequest(url string, request modelApplyRequest) (response map[
 		return
 	}
 	req.Header.Set("Content-Type", "application/json")
 	req.Header.Set("Authorization", bearerKey)
 	// Make the request
 	client := &http.Client{}
@@ -140,6 +153,7 @@ func postRequestJSON[B any](url string, bodyJson *B) error {
 	}
 	req.Header.Set("Content-Type", "application/json")
 	req.Header.Set("Authorization", bearerKey)
 	client := &http.Client{}
 	resp, err := client.Do(req)
@@ -175,6 +189,7 @@ func postRequestResponseJSON[B1 any, B2 any](url string, reqJson *B1, respJson *
 	}
 	req.Header.Set("Content-Type", "application/json")
 	req.Header.Set("Authorization", bearerKey)
 	client := &http.Client{}
 	resp, err := client.Do(req)
@@ -195,6 +210,35 @@ func postRequestResponseJSON[B1 any, B2 any](url string, reqJson *B1, respJson *
 	return json.Unmarshal(body, respJson)
 }
 // postInvalidRequest POSTs the literal body "invalid request" to url with a JSON
 // content type and no Authorization header.
 // It returns (nil, status) for a status in [200, 400), an error describing the
 // status and body together with that status otherwise, and (err, -1) when the
 // request cannot be built, sent, or its body cannot be read.
 func postInvalidRequest(url string) (error, int) {
 	payload := bytes.NewBufferString("invalid request")
 	req, err := http.NewRequest("POST", url, payload)
 	if err != nil {
 		return err, -1
 	}
 	req.Header.Set("Content-Type", "application/json")
 	httpClient := &http.Client{}
 	resp, err := httpClient.Do(req)
 	if err != nil {
 		return err, -1
 	}
 	defer resp.Body.Close()
 	raw, err := io.ReadAll(resp.Body)
 	if err != nil {
 		return err, -1
 	}
 	status := resp.StatusCode
 	if status >= 200 && status < 400 {
 		return nil, status
 	}
 	return fmt.Errorf("unexpected status code: %d, body: %s", status, string(raw)), status
 }
 //go:embed backend-assets/*
 var backendAssets embed.FS
@@ -260,6 +304,7 @@ var _ = Describe("API test", func() {
 					config.WithContext(c),
 					config.WithGalleries(galleries),
 					config.WithModelPath(modelDir),
 					config.WithApiKeys([]string{apiKey}),
 					config.WithBackendAssets(backendAssets),
 					config.WithBackendAssetsOutput(backendAssetsDir))...)
 			Expect(err).ToNot(HaveOccurred())
@@ -269,7 +314,7 @@ var _ = Describe("API test", func() {
 			go app.Listen("127.0.0.1:9090")
-			defaultConfig := openai.DefaultConfig("")
+			defaultConfig := openai.DefaultConfig(apiKey)
 			defaultConfig.BaseURL = "http://127.0.0.1:9090/v1"
 			client2 = openaigo.NewClient("")
@@ -295,10 +340,19 @@ var _ = Describe("API test", func() {
 			Expect(err).To(HaveOccurred())
 		})
 		Context("Auth Tests", func() {
 			It("Should fail if the api key is missing", func() {
 				err, sc := postInvalidRequest("http://127.0.0.1:9090/models/available")
 				Expect(err).ToNot(BeNil())
 				Expect(sc).To(Equal(403))
 			})
 		})
 		Context("Applying models", func() {
 			It("applies models from a gallery", func() {
-				models := getModels("http://127.0.0.1:9090/models/available")
+				models, err := getModels("http://127.0.0.1:9090/models/available")
 				Expect(err).To(BeNil())
 				Expect(len(models)).To(Equal(2), fmt.Sprint(models))
 				Expect(models[0].Installed).To(BeFalse(), fmt.Sprint(models))
 				Expect(models[1].Installed).To(BeFalse(), fmt.Sprint(models))
@@ -331,7 +385,8 @@ var _ = Describe("API test", func() {
 				Expect(content["backend"]).To(Equal("bert-embeddings"))
 				Expect(content["foo"]).To(Equal("bar"))
-				models = getModels("http://127.0.0.1:9090/models/available")
+				models, err = getModels("http://127.0.0.1:9090/models/available")
 				Expect(err).To(BeNil())
 				Expect(len(models)).To(Equal(2), fmt.Sprint(models))
 				Expect(models[0].Name).To(Or(Equal("bert"), Equal("bert2")))
 				Expect(models[1].Name).To(Or(Equal("bert"), Equal("bert2")))
--- a/core/http/ctx/fiber.go
+++ b/core/http/ctx/fiber.go
@@ -19,14 +19,16 @@ func ModelFromContext(ctx *fiber.Ctx, cl *config.BackendConfigLoader, loader *mo
 	if ctx.Params("model") != "" {
 		modelInput = ctx.Params("model")
 	}
-
+	if ctx.Query("model") != "" {
 		modelInput = ctx.Query("model")
 	}
 	// Set model from bearer token, if available
-	bearer := strings.TrimLeft(ctx.Get("authorization"), "Bearer ")
+	bearer := strings.TrimLeft(ctx.Get("authorization"), "Bear ") // Reduced duplicate characters of Bearer
 	bearerExists := bearer != "" && loader.ExistsInModelPath(bearer)
 	// If no model was specified, take the first available
 	if modelInput == "" && !bearerExists && firstModel {
-		models, _ := services.ListModels(cl, loader, "", true)
+		models, _ := services.ListModels(cl, loader, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
 		if len(models) > 0 {
 			modelInput = models[0]
 			log.Debug().Msgf("No model specified, using: %s", modelInput)
--- a/core/http/endpoints/elevenlabs/soundgeneration.go
+++ b/core/http/endpoints/elevenlabs/soundgeneration.go
@@ -55,7 +55,7 @@ func SoundGenerationEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoad
 		}
 		// TODO: Support uploading files?
-		filePath, _, err := backend.SoundGeneration(cfg.Backend, modelFile, input.Text, input.Duration, input.Temperature, input.DoSample, nil, nil, ml, appConfig, *cfg)
+		filePath, _, err := backend.SoundGeneration(modelFile, input.Text, input.Duration, input.Temperature, input.DoSample, nil, nil, ml, appConfig, *cfg)
 		if err != nil {
 			return err
 		}
--- a/core/http/endpoints/jina/rerank.go
+++ b/core/http/endpoints/jina/rerank.go
@@ -45,13 +45,13 @@ func JINARerankEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a
 			config.LoadOptionContextSize(appConfig.ContextSize),
 			config.LoadOptionF16(appConfig.F16),
 		)
 		if err != nil {
 			modelFile = input.Model
 			log.Warn().Msgf("Model not found in context: %s", input.Model)
 		} else {
 			modelFile = cfg.Model
 		}
 		log.Debug().Msgf("Request for model: %s", modelFile)
 		if input.Backend != "" {
@@ -64,7 +64,7 @@ func JINARerankEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a
 			Documents: req.Documents,
 		}
-		results, err := backend.Rerank(cfg.Backend, modelFile, request, ml, appConfig, *cfg)
+		results, err := backend.Rerank(modelFile, request, ml, appConfig, *cfg)
 		if err != nil {
 			return err
 		}
--- a/core/http/endpoints/localai/get_token_metrics.go
+++ b/core/http/endpoints/localai/get_token_metrics.go
@@ -0,0 +1,60 @@
 package localai
 import (
 	"github.com/gofiber/fiber/v2"
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
 	fiberContext "github.com/mudler/LocalAI/core/http/ctx"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/rs/zerolog/log"
 	"github.com/mudler/LocalAI/pkg/model"
 )
 // TokenMetricsEndpoint is an endpoint to get TokensProcessed Per Second for Active SlotID
 //
 //	@Summary	Get TokenMetrics for Active Slot.
 //	@Accept json
 //	@Produce json
 //	@Success	200		{object}	object				"token metrics reported by the backend"
 //	@Router		/v1/tokenMetrics [get]
 //	@Router		/tokenMetrics [get]
 func TokenMetricsEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 		input := new(schema.TokenMetricsRequest)
 		// Get input data from the request body
 		if err := c.BodyParser(input); err != nil {
 			return err
 		}
 		// Resolve the model name from the request; when that fails, fall back
 		// to the name supplied in the body and keep going (best effort).
 		modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.Model, false)
 		if err != nil {
 			modelFile = input.Model
 			log.Warn().Msgf("Model not found in context: %s", input.Model)
 		}
 		cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
 			config.LoadOptionDebug(appConfig.Debug),
 			config.LoadOptionThreads(appConfig.Threads),
 			config.LoadOptionContextSize(appConfig.ContextSize),
 			config.LoadOptionF16(appConfig.F16),
 		)
 		if err != nil {
 			// A zerolog event is only written once Msg/Send is called.
 			log.Err(err).Msg("failed loading backend config")
 			if cfg == nil {
 				// The config is dereferenced below; without one there is
 				// nothing to query, so report the load error to the caller.
 				return err
 			}
 			modelFile = input.Model
 			log.Warn().Msgf("Model not found in context: %s", input.Model)
 		} else {
 			modelFile = cfg.Model
 		}
 		log.Debug().Msgf("Token Metrics for model: %s", modelFile)
 		response, err := backend.TokenMetrics(modelFile, ml, appConfig, *cfg)
 		if err != nil {
 			return err
 		}
 		// The metrics are returned to the client as a JSON document.
 		return c.JSON(response)
 	}
 }
--- a/core/http/endpoints/localai/system.go
+++ b/core/http/endpoints/localai/system.go
@@ -17,12 +17,14 @@ func SystemInformations(ml *model.ModelLoader, appConfig *config.ApplicationConf
 		if err != nil {
 			return err
 		}
 		loadedModels := ml.ListModels()
 		for b := range appConfig.ExternalGRPCBackends {
 			availableBackends = append(availableBackends, b)
 		}
 		return c.JSON(
 			schema.SystemInformationResponse{
 				Backends: availableBackends,
 				Models:   loadedModels,
 			},
 		)
 	}
--- a/core/http/endpoints/localai/tokenize.go
+++ b/core/http/endpoints/localai/tokenize.go
@@ -0,0 +1,58 @@
 package localai
 import (
 	"github.com/gofiber/fiber/v2"
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
 	fiberContext "github.com/mudler/LocalAI/core/http/ctx"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/rs/zerolog/log"
 )
 // TokenizeEndpoint exposes a REST API to tokenize the content
 // @Summary Tokenize the input.
 // @Success 200 {object} schema.TokenizeResponse "Response"
 // @Router /v1/tokenize [post]
 func TokenizeEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 		input := new(schema.TokenizeRequest)
 		// Get input data from the request body
 		if err := c.BodyParser(input); err != nil {
 			return err
 		}
 		// Resolve the model name from the request; when that fails, fall back
 		// to the name supplied in the body and keep going (best effort).
 		modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.Model, false)
 		if err != nil {
 			modelFile = input.Model
 			log.Warn().Msgf("Model not found in context: %s", input.Model)
 		}
 		cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
 			config.LoadOptionDebug(appConfig.Debug),
 			config.LoadOptionThreads(appConfig.Threads),
 			config.LoadOptionContextSize(appConfig.ContextSize),
 			config.LoadOptionF16(appConfig.F16),
 		)
 		if err != nil {
 			// A zerolog event is only written once Msg/Send is called.
 			log.Err(err).Msg("failed loading backend config")
 			if cfg == nil {
 				// The config is dereferenced below; without one there is
 				// nothing to tokenize with, so report the load error.
 				return err
 			}
 			modelFile = input.Model
 			log.Warn().Msgf("Model not found in context: %s", input.Model)
 		} else {
 			modelFile = cfg.Model
 		}
 		log.Debug().Msgf("Request for model: %s", modelFile)
 		tokenResponse, err := backend.ModelTokenize(input.Content, ml, *cfg, appConfig)
 		if err != nil {
 			return err
 		}
 		// Propagate serialization/write failures instead of discarding them.
 		return c.JSON(tokenResponse)
 	}
 }
--- a/core/http/endpoints/localai/welcome.go
+++ b/core/http/endpoints/localai/welcome.go
@@ -13,7 +13,7 @@ import (
 func WelcomeEndpoint(appConfig *config.ApplicationConfig,
 	cl *config.BackendConfigLoader, ml *model.ModelLoader, modelStatus func() (map[string]string, map[string]string)) func(*fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
-		models, _ := services.ListModels(cl, ml, "", true)
+		models, _ := services.ListModels(cl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
 		backendConfigs := cl.GetAllBackendConfigs()
 		galleryConfigs := map[string]*gallery.Config{}
@@ -32,18 +32,10 @@ func WelcomeEndpoint(appConfig *config.ApplicationConfig,
 		// Get model statuses to display in the UI the operation in progress
 		processingModels, taskTypes := modelStatus()
 		modelsWithoutConfig := []string{}
 		for _, m := range models {
 			if _, ok := modelsWithBackendConfig[m]; !ok {
 				modelsWithoutConfig = append(modelsWithoutConfig, m)
 			}
 		}
 		summary := fiber.Map{
 			"Title":             "LocalAI API - " + internal.PrintableVersion(),
 			"Version":           internal.PrintableVersion(),
-			"Models":            modelsWithoutConfig,
+			"Models":            models,
 			"ModelsConfig":      backendConfigs,
 			"GalleryConfig":     galleryConfigs,
 			"IsP2PEnabled":      p2p.IsP2PEnabled(),
--- a/core/http/endpoints/openai/assistant.go
+++ b/core/http/endpoints/openai/assistant.go
@@ -225,7 +225,7 @@ func filterAssistantsAfterID(assistants []Assistant, id string) []Assistant {
 func modelExists(cl *config.BackendConfigLoader, ml *model.ModelLoader, modelName string) (found bool) {
 	found = false
-	models, err := services.ListModels(cl, ml, "", true)
+	models, err := services.ListModels(cl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
 	if err != nil {
 		return
 	}
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -161,6 +161,12 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 		textContentToReturn = ""
 		id = uuid.New().String()
 		created = int(time.Now().Unix())
 		// Set CorrelationID
 		correlationID := c.Get("X-Correlation-ID")
 		if len(strings.TrimSpace(correlationID)) == 0 {
 			correlationID = id
 		}
 		c.Set("X-Correlation-ID", correlationID)
 		modelFile, input, err := readRequest(c, cl, ml, startupOptions, true)
 		if err != nil {
@@ -444,6 +450,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 			c.Set("Cache-Control", "no-cache")
 			c.Set("Connection", "keep-alive")
 			c.Set("Transfer-Encoding", "chunked")
 			c.Set("X-Correlation-ID", id)
 			responses := make(chan schema.OpenAIResponse)
@@ -640,8 +647,16 @@ func handleQuestion(config *config.BackendConfig, input *schema.OpenAIRequest, m
 	for _, m := range input.Messages {
 		images = append(images, m.StringImages...)
 	}
 	videos := []string{}
 	for _, m := range input.Messages {
 		videos = append(videos, m.StringVideos...)
 	}
 	audios := []string{}
 	for _, m := range input.Messages {
 		audios = append(audios, m.StringAudios...)
 	}
-	predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, ml, *config, o, nil)
+	predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, videos, audios, ml, *config, o, nil)
 	if err != nil {
 		log.Error().Err(err).Msg("model inference failed")
 		return "", err
--- a/core/http/endpoints/openai/completion.go
+++ b/core/http/endpoints/openai/completion.go
@@ -57,6 +57,8 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a
 	}
 	return func(c *fiber.Ctx) error {
 		// Add Correlation
 		c.Set("X-Correlation-ID", id)
 		modelFile, input, err := readRequest(c, cl, ml, appConfig, true)
 		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
--- a/core/http/endpoints/openai/inference.go
+++ b/core/http/endpoints/openai/inference.go
@@ -27,9 +27,17 @@ func ComputeChoices(
 	for _, m := range req.Messages {
 		images = append(images, m.StringImages...)
 	}
 	videos := []string{}
 	for _, m := range req.Messages {
 		videos = append(videos, m.StringVideos...)
 	}
 	audios := []string{}
 	for _, m := range req.Messages {
 		audios = append(audios, m.StringAudios...)
 	}
 	// get the model function to call for the result
-	predFunc, err := backend.ModelInference(req.Context, predInput, req.Messages, images, loader, *config, o, tokenCallback)
+	predFunc, err := backend.ModelInference(req.Context, predInput, req.Messages, images, videos, audios, loader, *config, o, tokenCallback)
 	if err != nil {
 		return result, backend.TokenUsage{}, err
 	}
--- a/core/http/endpoints/openai/list.go
+++ b/core/http/endpoints/openai/list.go
@@ -18,32 +18,32 @@ func ListModelsEndpoint(bcl *config.BackendConfigLoader, ml *model.ModelLoader)
 		filter := c.Query("filter")
 		// By default, exclude any loose files that are already referenced by a configuration file.
-		excludeConfigured := c.QueryBool("excludeConfigured", true)
+		var policy services.LooseFilePolicy
 		if c.QueryBool("excludeConfigured", true) {
 			policy = services.SKIP_IF_CONFIGURED
 		} else {
 			policy = services.ALWAYS_INCLUDE // This replicates current behavior. TODO: give more options to the user?
 		}
-		dataModels, err := modelList(bcl, ml, filter, excludeConfigured)
+		filterFn, err := config.BuildNameFilterFn(filter)
 		if err != nil {
 			return err
 		}
 		modelNames, err := services.ListModels(bcl, ml, filterFn, policy)
 		if err != nil {
 			return err
 		}
 		// Map from a slice of names to a slice of OpenAIModel response objects
 		dataModels := []schema.OpenAIModel{}
 		for _, m := range modelNames {
 			dataModels = append(dataModels, schema.OpenAIModel{ID: m, Object: "model"})
 		}
 		return c.JSON(schema.ModelsDataResponse{
 			Object: "list",
 			Data:   dataModels,
 		})
 	}
 }
 // modelList asks services.ListModels for the model names matching filter and
 // wraps each name in a schema.OpenAIModel whose Object field is "model".
 // Any error from services.ListModels is returned unchanged with a nil slice.
 func modelList(bcl *config.BackendConfigLoader, ml *model.ModelLoader, filter string, excludeConfigured bool) ([]schema.OpenAIModel, error) {
 	names, listErr := services.ListModels(bcl, ml, filter, excludeConfigured)
 	if listErr != nil {
 		return nil, listErr
 	}
 	// Non-nil even when there are no names, so the result is never a nil slice.
 	entries := make([]schema.OpenAIModel, 0, len(names))
 	for _, name := range names {
 		entry := schema.OpenAIModel{ID: name, Object: "model"}
 		entries = append(entries, entry)
 	}
 	return entries, nil
 }
--- a/core/http/endpoints/openai/request.go
+++ b/core/http/endpoints/openai/request.go
@@ -6,15 +6,22 @@ import (
 	"fmt"
 	"github.com/gofiber/fiber/v2"
 	"github.com/google/uuid"
 	"github.com/mudler/LocalAI/core/config"
 	fiberContext "github.com/mudler/LocalAI/core/http/ctx"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/functions"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/templates"
 	"github.com/mudler/LocalAI/pkg/utils"
 	"github.com/rs/zerolog/log"
 )
 // correlationIDKeyType is the dedicated string type of CorrelationIDKey, so
 // the key has its own type rather than being a plain string.
 type correlationIDKeyType string
 // CorrelationIDKey to track request across process boundary
 const CorrelationIDKey correlationIDKeyType = "correlationID"
 func readRequest(c *fiber.Ctx, cl *config.BackendConfigLoader, ml *model.ModelLoader, o *config.ApplicationConfig, firstModel bool) (string, *schema.OpenAIRequest, error) {
 	input := new(schema.OpenAIRequest)
@@ -24,9 +31,14 @@ func readRequest(c *fiber.Ctx, cl *config.BackendConfigLoader, ml *model.ModelLo
 	}
 	received, _ := json.Marshal(input)
 	// Extract or generate the correlation ID
 	correlationID := c.Get("X-Correlation-ID", uuid.New().String())
 	ctx, cancel := context.WithCancel(o.Context)
-	input.Context = ctx
+	// Add the correlation ID to the new context
 	ctxWithCorrelationID := context.WithValue(ctx, CorrelationIDKey, correlationID)
 	input.Context = ctxWithCorrelationID
 	input.Cancel = cancel
 	log.Debug().Msgf("Request received: %s", string(received))
@@ -135,7 +147,7 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
 	}
 	// Decode each request's message content
-	index := 0
+	imgIndex, vidIndex, audioIndex := 0, 0, 0
 	for i, m := range input.Messages {
 		switch content := m.Content.(type) {
 		case string:
@@ -144,20 +156,58 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
 			dat, _ := json.Marshal(content)
 			c := []schema.Content{}
 			json.Unmarshal(dat, &c)
 		CONTENT:
 			for _, pp := range c {
-				if pp.Type == "text" {
+				switch pp.Type {
 				case "text":
 					input.Messages[i].StringContent = pp.Text
-				} else if pp.Type == "image_url" {
+				case "video", "video_url":
-					// Detect if pp.ImageURL is an URL, if it is download the image and encode it in base64:
+					// Decode content as base64 either if it's an URL or base64 text
-					base64, err := utils.GetImageURLAsBase64(pp.ImageURL.URL)
+					base64, err := utils.GetContentURIAsBase64(pp.VideoURL.URL)
-					if err == nil {
+					if err != nil {
-						input.Messages[i].StringImages = append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff
+						log.Error().Msgf("Failed encoding video: %s", err)
-						// set a placeholder for each image
+						continue CONTENT
 						input.Messages[i].StringContent = fmt.Sprintf("[img-%d]", index) + input.Messages[i].StringContent
 						index++
 					} else {
 						log.Error().Msgf("Failed encoding image: %s", err)
 					}
 					input.Messages[i].StringVideos = append(input.Messages[i].StringVideos, base64) // TODO: make sure that we only return base64 stuff
 					t := "[vid-{{.ID}}]{{.Text}}"
 					if config.TemplateConfig.Video != "" {
 						t = config.TemplateConfig.Video
 					}
 					// set a placeholder for each image
 					input.Messages[i].StringContent, _ = templates.TemplateMultiModal(t, vidIndex, input.Messages[i].StringContent)
 					vidIndex++
 				case "audio_url", "audio":
 					// Decode content as base64 either if it's an URL or base64 text
 					base64, err := utils.GetContentURIAsBase64(pp.AudioURL.URL)
 					if err != nil {
 						log.Error().Msgf("Failed encoding image: %s", err)
 						continue CONTENT
 					}
 					input.Messages[i].StringAudios = append(input.Messages[i].StringAudios, base64) // TODO: make sure that we only return base64 stuff
 					// set a placeholder for each image
 					t := "[audio-{{.ID}}]{{.Text}}"
 					if config.TemplateConfig.Audio != "" {
 						t = config.TemplateConfig.Audio
 					}
 					input.Messages[i].StringContent, _ = templates.TemplateMultiModal(t, audioIndex, input.Messages[i].StringContent)
 					audioIndex++
 				case "image_url", "image":
 					// Decode content as base64 either if it's an URL or base64 text
 					base64, err := utils.GetContentURIAsBase64(pp.ImageURL.URL)
 					if err != nil {
 						log.Error().Msgf("Failed encoding image: %s", err)
 						continue CONTENT
 					}
 					t := "[img-{{.ID}}]{{.Text}}"
 					if config.TemplateConfig.Image != "" {
 						t = config.TemplateConfig.Image
 					}
 					input.Messages[i].StringImages = append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff
 					// set a placeholder for each image
 					input.Messages[i].StringContent, _ = templates.TemplateMultiModal(t, imgIndex, input.Messages[i].StringContent)
 					imgIndex++
 				}
 			}
 		}
--- a/core/http/middleware/auth.go
+++ b/core/http/middleware/auth.go
@@ -0,0 +1,94 @@
 package middleware
 import (
 	"crypto/subtle"
 	"errors"
 	"github.com/dave-gray101/v2keyauth"
 	"github.com/gofiber/fiber/v2"
 	"github.com/gofiber/fiber/v2/middleware/keyauth"
 	"github.com/mudler/LocalAI/core/config"
 )
 // This file contains the configuration generators and handler functions that are used along with the fiber/keyauth middleware
 // Currently this requires an upstream patch - and feature patches are no longer accepted to v2
 // Therefore `dave-gray101/v2keyauth` contains the v2 backport of the middleware until v3 stabilizes and we migrate.
 // GetKeyAuthConfig assembles the v2keyauth configuration for the given
 // application config. The key is looked up in the Authorization, x-api-key and
 // xi-api-key headers; the skip filter, validator and error handler are built
 // from applicationConfig. It returns an error only when the multi-source key
 // lookup cannot be constructed.
 func GetKeyAuthConfig(applicationConfig *config.ApplicationConfig) (*v2keyauth.Config, error) {
 	// Headers that may carry the API key.
 	keySources := []string{"header:Authorization", "header:x-api-key", "header:xi-api-key"}
 	lookup, lookupErr := v2keyauth.MultipleKeySourceLookup(keySources, keyauth.ConfigDefault.AuthScheme)
 	if lookupErr != nil {
 		return nil, lookupErr
 	}
 	authConfig := &v2keyauth.Config{
 		CustomKeyLookup: lookup,
 		Next:            getApiKeyRequiredFilterFunction(applicationConfig),
 		Validator:       getApiKeyValidationFunction(applicationConfig),
 		ErrorHandler:    getApiKeyErrorHandler(applicationConfig),
 		AuthScheme:      "Bearer",
 	}
 	return authConfig, nil
 }
 // getApiKeyErrorHandler returns the fiber error handler used by the key-auth
 // middleware. A missing/malformed-key error continues to the next handler
 // when no API keys are configured, and otherwise answers 403 (with the error
 // text unless OpaqueErrors is set). Any other error yields a bare 500 when
 // OpaqueErrors is set and is returned unchanged otherwise.
 func getApiKeyErrorHandler(applicationConfig *config.ApplicationConfig) fiber.ErrorHandler {
 	return func(ctx *fiber.Ctx, err error) error {
 		// Errors unrelated to the API key itself.
 		if !errors.Is(err, v2keyauth.ErrMissingOrMalformedAPIKey) {
 			if applicationConfig.OpaqueErrors {
 				return ctx.SendStatus(500)
 			}
 			return err
 		}
 		switch {
 		case len(applicationConfig.ApiKeys) == 0:
 			// if no keys are set up, any error we get here is not an error.
 			return ctx.Next()
 		case applicationConfig.OpaqueErrors:
 			return ctx.SendStatus(403)
 		default:
 			return ctx.Status(403).SendString(err.Error())
 		}
 	}
 }
 // getApiKeyValidationFunction returns the validator handed to the key-auth
 // middleware. The validator accepts every request when no API keys are
 // configured; otherwise it accepts a key equal to one of the configured keys
 // and reports v2keyauth.ErrMissingOrMalformedAPIKey for anything else.
 // UseSubtleKeyComparison is read once, when the validator is built, and
 // selects subtle.ConstantTimeCompare over plain string equality.
 func getApiKeyValidationFunction(applicationConfig *config.ApplicationConfig) func(*fiber.Ctx, string) (bool, error) {
 	keysMatch := func(candidate, known string) bool { return candidate == known }
 	if applicationConfig.UseSubtleKeyComparison {
 		keysMatch = func(candidate, known string) bool {
 			return subtle.ConstantTimeCompare([]byte(candidate), []byte(known)) == 1
 		}
 	}
 	return func(_ *fiber.Ctx, apiKey string) (bool, error) {
 		// If no keys are setup, accept everything
 		if len(applicationConfig.ApiKeys) == 0 {
 			return true, nil
 		}
 		for _, known := range applicationConfig.ApiKeys {
 			if keysMatch(apiKey, known) {
 				return true, nil
 			}
 		}
 		return false, v2keyauth.ErrMissingOrMalformedAPIKey
 	}
 }
 // getApiKeyRequiredFilterFunction returns the predicate installed as the
 // key-auth middleware's Next hook (NOTE(review): in fiber middleware a true
 // result conventionally skips the middleware - confirm for v2keyauth).
 // Unless DisableApiKeyRequirementForHttpGet is set the predicate is always
 // false. When it is set, the predicate is true only for GET requests whose
 // path matches one of HttpGetExemptedEndpoints.
 func getApiKeyRequiredFilterFunction(applicationConfig *config.ApplicationConfig) func(*fiber.Ctx) bool {
 	if !applicationConfig.DisableApiKeyRequirementForHttpGet {
 		return func(*fiber.Ctx) bool { return false }
 	}
 	return func(c *fiber.Ctx) bool {
 		if c.Method() == "GET" {
 			for _, exempted := range applicationConfig.HttpGetExemptedEndpoints {
 				if exempted.MatchString(c.Path()) {
 					return true
 				}
 			}
 		}
 		return false
 	}
 }
--- a/core/http/routes/elevenlabs.go
+++ b/core/http/routes/elevenlabs.go
@@ -10,12 +10,11 @@ import (
 func RegisterElevenLabsRoutes(app *fiber.App,
 	cl *config.BackendConfigLoader,
 	ml *model.ModelLoader,
-	appConfig *config.ApplicationConfig,
+	appConfig *config.ApplicationConfig) {
 	auth func(*fiber.Ctx) error) {
 	// Elevenlabs
-	app.Post("/v1/text-to-speech/:voice-id", auth, elevenlabs.TTSEndpoint(cl, ml, appConfig))
+	app.Post("/v1/text-to-speech/:voice-id", elevenlabs.TTSEndpoint(cl, ml, appConfig))
-	app.Post("/v1/sound-generation", auth, elevenlabs.SoundGenerationEndpoint(cl, ml, appConfig))
+	app.Post("/v1/sound-generation", elevenlabs.SoundGenerationEndpoint(cl, ml, appConfig))
 }
--- a/core/http/routes/health.go
+++ b/core/http/routes/health.go
@@ -0,0 +1,13 @@
 package routes
 import "github.com/gofiber/fiber/v2"
 // HealthRoutes registers the service health-check endpoints on app:
 // GET /healthz and GET /readyz both reply with a bare HTTP 200.
 func HealthRoutes(app *fiber.App) {
 	// One handler is shared by both probes.
 	alwaysOK := func(c *fiber.Ctx) error {
 		return c.SendStatus(200)
 	}
 	for _, probePath := range []string{"/healthz", "/readyz"} {
 		app.Get(probePath, alwaysOK)
 	}
 }
--- a/core/http/routes/jina.go
+++ b/core/http/routes/jina.go
@@ -11,8 +11,7 @@ import (
 func RegisterJINARoutes(app *fiber.App,
 	cl *config.BackendConfigLoader,
 	ml *model.ModelLoader,
-	appConfig *config.ApplicationConfig,
+	appConfig *config.ApplicationConfig) {
 	auth func(*fiber.Ctx) error) {
 	// POST endpoint to mimic the reranking
 	app.Post("/v1/rerank", jina.JINARerankEndpoint(cl, ml, appConfig))
--- a/core/http/routes/localai.go
+++ b/core/http/routes/localai.go
@@ -15,61 +15,55 @@ func RegisterLocalAIRoutes(app *fiber.App,
 	cl *config.BackendConfigLoader,
 	ml *model.ModelLoader,
 	appConfig *config.ApplicationConfig,
-	galleryService *services.GalleryService,
+	galleryService *services.GalleryService) {
 	auth func(*fiber.Ctx) error) {
 	app.Get("/swagger/*", swagger.HandlerDefault) // default
 	// LocalAI API endpoints
 	if !appConfig.DisableGalleryEndpoint {
 		modelGalleryEndpointService := localai.CreateModelGalleryEndpointService(appConfig.Galleries, appConfig.ModelPath, galleryService)
-		app.Post("/models/apply", auth, modelGalleryEndpointService.ApplyModelGalleryEndpoint())
+		app.Post("/models/apply", modelGalleryEndpointService.ApplyModelGalleryEndpoint())
-		app.Post("/models/delete/:name", auth, modelGalleryEndpointService.DeleteModelGalleryEndpoint())
+		app.Post("/models/delete/:name", modelGalleryEndpointService.DeleteModelGalleryEndpoint())
-		app.Get("/models/available", auth, modelGalleryEndpointService.ListModelFromGalleryEndpoint())
+		app.Get("/models/available", modelGalleryEndpointService.ListModelFromGalleryEndpoint())
-		app.Get("/models/galleries", auth, modelGalleryEndpointService.ListModelGalleriesEndpoint())
+		app.Get("/models/galleries", modelGalleryEndpointService.ListModelGalleriesEndpoint())
-		app.Post("/models/galleries", auth, modelGalleryEndpointService.AddModelGalleryEndpoint())
+		app.Post("/models/galleries", modelGalleryEndpointService.AddModelGalleryEndpoint())
-		app.Delete("/models/galleries", auth, modelGalleryEndpointService.RemoveModelGalleryEndpoint())
+		app.Delete("/models/galleries", modelGalleryEndpointService.RemoveModelGalleryEndpoint())
-		app.Get("/models/jobs/:uuid", auth, modelGalleryEndpointService.GetOpStatusEndpoint())
+		app.Get("/models/jobs/:uuid", modelGalleryEndpointService.GetOpStatusEndpoint())
-		app.Get("/models/jobs", auth, modelGalleryEndpointService.GetAllStatusEndpoint())
+		app.Get("/models/jobs", modelGalleryEndpointService.GetAllStatusEndpoint())
 	}
-	app.Post("/tts", auth, localai.TTSEndpoint(cl, ml, appConfig))
+	app.Post("/tts", localai.TTSEndpoint(cl, ml, appConfig))
 	// Stores
 	sl := model.NewModelLoader("")
-	app.Post("/stores/set", auth, localai.StoresSetEndpoint(sl, appConfig))
+	app.Post("/stores/set", localai.StoresSetEndpoint(sl, appConfig))
-	app.Post("/stores/delete", auth, localai.StoresDeleteEndpoint(sl, appConfig))
+	app.Post("/stores/delete", localai.StoresDeleteEndpoint(sl, appConfig))
-	app.Post("/stores/get", auth, localai.StoresGetEndpoint(sl, appConfig))
+	app.Post("/stores/get", localai.StoresGetEndpoint(sl, appConfig))
-	app.Post("/stores/find", auth, localai.StoresFindEndpoint(sl, appConfig))
+	app.Post("/stores/find", localai.StoresFindEndpoint(sl, appConfig))
-	// Kubernetes health checks
+	app.Get("/metrics", localai.LocalAIMetricsEndpoint())
 	ok := func(c *fiber.Ctx) error {
 		return c.SendStatus(200)
 	}
 	app.Get("/healthz", ok)
 	app.Get("/readyz", ok)
 	app.Get("/metrics", auth, localai.LocalAIMetricsEndpoint())
 	// Experimental Backend Statistics Module
 	backendMonitorService := services.NewBackendMonitorService(ml, cl, appConfig) // Split out for now
-	app.Get("/backend/monitor", auth, localai.BackendMonitorEndpoint(backendMonitorService))
+	app.Get("/backend/monitor", localai.BackendMonitorEndpoint(backendMonitorService))
-	app.Post("/backend/shutdown", auth, localai.BackendShutdownEndpoint(backendMonitorService))
+	app.Post("/backend/shutdown", localai.BackendShutdownEndpoint(backendMonitorService))
 	// p2p
 	if p2p.IsP2PEnabled() {
-		app.Get("/api/p2p", auth, localai.ShowP2PNodes(appConfig))
+		app.Get("/api/p2p", localai.ShowP2PNodes(appConfig))
-		app.Get("/api/p2p/token", auth, localai.ShowP2PToken(appConfig))
+		app.Get("/api/p2p/token", localai.ShowP2PToken(appConfig))
 	}
-	app.Get("/version", auth, func(c *fiber.Ctx) error {
+	app.Get("/version", func(c *fiber.Ctx) error {
 		return c.JSON(struct {
 			Version string `json:"version"`
 		}{Version: internal.PrintableVersion()})
 	})
-	app.Get("/system", auth, localai.SystemInformations(ml, appConfig))
+	app.Get("/system", localai.SystemInformations(ml, appConfig))
 	// misc
 	app.Post("/v1/tokenize", localai.TokenizeEndpoint(cl, ml, appConfig))
 }
--- a/core/http/routes/openai.go
+++ b/core/http/routes/openai.go
@@ -11,66 +11,65 @@ import (
 func RegisterOpenAIRoutes(app *fiber.App,
 	cl *config.BackendConfigLoader,
 	ml *model.ModelLoader,
-	appConfig *config.ApplicationConfig,
+	appConfig *config.ApplicationConfig) {
 	auth func(*fiber.Ctx) error) {
 	// openAI compatible API endpoint
 	// chat
-	app.Post("/v1/chat/completions", auth, openai.ChatEndpoint(cl, ml, appConfig))
+	app.Post("/v1/chat/completions", openai.ChatEndpoint(cl, ml, appConfig))
-	app.Post("/chat/completions", auth, openai.ChatEndpoint(cl, ml, appConfig))
+	app.Post("/chat/completions", openai.ChatEndpoint(cl, ml, appConfig))
 	// edit
-	app.Post("/v1/edits", auth, openai.EditEndpoint(cl, ml, appConfig))
+	app.Post("/v1/edits", openai.EditEndpoint(cl, ml, appConfig))
-	app.Post("/edits", auth, openai.EditEndpoint(cl, ml, appConfig))
+	app.Post("/edits", openai.EditEndpoint(cl, ml, appConfig))
 	// assistant
-	app.Get("/v1/assistants", auth, openai.ListAssistantsEndpoint(cl, ml, appConfig))
+	app.Get("/v1/assistants", openai.ListAssistantsEndpoint(cl, ml, appConfig))
-	app.Get("/assistants", auth, openai.ListAssistantsEndpoint(cl, ml, appConfig))
+	app.Get("/assistants", openai.ListAssistantsEndpoint(cl, ml, appConfig))
-	app.Post("/v1/assistants", auth, openai.CreateAssistantEndpoint(cl, ml, appConfig))
+	app.Post("/v1/assistants", openai.CreateAssistantEndpoint(cl, ml, appConfig))
-	app.Post("/assistants", auth, openai.CreateAssistantEndpoint(cl, ml, appConfig))
+	app.Post("/assistants", openai.CreateAssistantEndpoint(cl, ml, appConfig))
-	app.Delete("/v1/assistants/:assistant_id", auth, openai.DeleteAssistantEndpoint(cl, ml, appConfig))
+	app.Delete("/v1/assistants/:assistant_id", openai.DeleteAssistantEndpoint(cl, ml, appConfig))
-	app.Delete("/assistants/:assistant_id", auth, openai.DeleteAssistantEndpoint(cl, ml, appConfig))
+	app.Delete("/assistants/:assistant_id", openai.DeleteAssistantEndpoint(cl, ml, appConfig))
-	app.Get("/v1/assistants/:assistant_id", auth, openai.GetAssistantEndpoint(cl, ml, appConfig))
+	app.Get("/v1/assistants/:assistant_id", openai.GetAssistantEndpoint(cl, ml, appConfig))
-	app.Get("/assistants/:assistant_id", auth, openai.GetAssistantEndpoint(cl, ml, appConfig))
+	app.Get("/assistants/:assistant_id", openai.GetAssistantEndpoint(cl, ml, appConfig))
-	app.Post("/v1/assistants/:assistant_id", auth, openai.ModifyAssistantEndpoint(cl, ml, appConfig))
+	app.Post("/v1/assistants/:assistant_id", openai.ModifyAssistantEndpoint(cl, ml, appConfig))
-	app.Post("/assistants/:assistant_id", auth, openai.ModifyAssistantEndpoint(cl, ml, appConfig))
+	app.Post("/assistants/:assistant_id", openai.ModifyAssistantEndpoint(cl, ml, appConfig))
-	app.Get("/v1/assistants/:assistant_id/files", auth, openai.ListAssistantFilesEndpoint(cl, ml, appConfig))
+	app.Get("/v1/assistants/:assistant_id/files", openai.ListAssistantFilesEndpoint(cl, ml, appConfig))
-	app.Get("/assistants/:assistant_id/files", auth, openai.ListAssistantFilesEndpoint(cl, ml, appConfig))
+	app.Get("/assistants/:assistant_id/files", openai.ListAssistantFilesEndpoint(cl, ml, appConfig))
-	app.Post("/v1/assistants/:assistant_id/files", auth, openai.CreateAssistantFileEndpoint(cl, ml, appConfig))
+	app.Post("/v1/assistants/:assistant_id/files", openai.CreateAssistantFileEndpoint(cl, ml, appConfig))
-	app.Post("/assistants/:assistant_id/files", auth, openai.CreateAssistantFileEndpoint(cl, ml, appConfig))
+	app.Post("/assistants/:assistant_id/files", openai.CreateAssistantFileEndpoint(cl, ml, appConfig))
-	app.Delete("/v1/assistants/:assistant_id/files/:file_id", auth, openai.DeleteAssistantFileEndpoint(cl, ml, appConfig))
+	app.Delete("/v1/assistants/:assistant_id/files/:file_id", openai.DeleteAssistantFileEndpoint(cl, ml, appConfig))
-	app.Delete("/assistants/:assistant_id/files/:file_id", auth, openai.DeleteAssistantFileEndpoint(cl, ml, appConfig))
+	app.Delete("/assistants/:assistant_id/files/:file_id", openai.DeleteAssistantFileEndpoint(cl, ml, appConfig))
-	app.Get("/v1/assistants/:assistant_id/files/:file_id", auth, openai.GetAssistantFileEndpoint(cl, ml, appConfig))
+	app.Get("/v1/assistants/:assistant_id/files/:file_id", openai.GetAssistantFileEndpoint(cl, ml, appConfig))
-	app.Get("/assistants/:assistant_id/files/:file_id", auth, openai.GetAssistantFileEndpoint(cl, ml, appConfig))
+	app.Get("/assistants/:assistant_id/files/:file_id", openai.GetAssistantFileEndpoint(cl, ml, appConfig))
 	// files
-	app.Post("/v1/files", auth, openai.UploadFilesEndpoint(cl, appConfig))
+	app.Post("/v1/files", openai.UploadFilesEndpoint(cl, appConfig))
-	app.Post("/files", auth, openai.UploadFilesEndpoint(cl, appConfig))
+	app.Post("/files", openai.UploadFilesEndpoint(cl, appConfig))
-	app.Get("/v1/files", auth, openai.ListFilesEndpoint(cl, appConfig))
+	app.Get("/v1/files", openai.ListFilesEndpoint(cl, appConfig))
-	app.Get("/files", auth, openai.ListFilesEndpoint(cl, appConfig))
+	app.Get("/files", openai.ListFilesEndpoint(cl, appConfig))
-	app.Get("/v1/files/:file_id", auth, openai.GetFilesEndpoint(cl, appConfig))
+	app.Get("/v1/files/:file_id", openai.GetFilesEndpoint(cl, appConfig))
-	app.Get("/files/:file_id", auth, openai.GetFilesEndpoint(cl, appConfig))
+	app.Get("/files/:file_id", openai.GetFilesEndpoint(cl, appConfig))
-	app.Delete("/v1/files/:file_id", auth, openai.DeleteFilesEndpoint(cl, appConfig))
+	app.Delete("/v1/files/:file_id", openai.DeleteFilesEndpoint(cl, appConfig))
-	app.Delete("/files/:file_id", auth, openai.DeleteFilesEndpoint(cl, appConfig))
+	app.Delete("/files/:file_id", openai.DeleteFilesEndpoint(cl, appConfig))
-	app.Get("/v1/files/:file_id/content", auth, openai.GetFilesContentsEndpoint(cl, appConfig))
+	app.Get("/v1/files/:file_id/content", openai.GetFilesContentsEndpoint(cl, appConfig))
-	app.Get("/files/:file_id/content", auth, openai.GetFilesContentsEndpoint(cl, appConfig))
+	app.Get("/files/:file_id/content", openai.GetFilesContentsEndpoint(cl, appConfig))
 	// completion
-	app.Post("/v1/completions", auth, openai.CompletionEndpoint(cl, ml, appConfig))
+	app.Post("/v1/completions", openai.CompletionEndpoint(cl, ml, appConfig))
-	app.Post("/completions", auth, openai.CompletionEndpoint(cl, ml, appConfig))
+	app.Post("/completions", openai.CompletionEndpoint(cl, ml, appConfig))
-	app.Post("/v1/engines/:model/completions", auth, openai.CompletionEndpoint(cl, ml, appConfig))
+	app.Post("/v1/engines/:model/completions", openai.CompletionEndpoint(cl, ml, appConfig))
 	// embeddings
-	app.Post("/v1/embeddings", auth, openai.EmbeddingsEndpoint(cl, ml, appConfig))
+	app.Post("/v1/embeddings", openai.EmbeddingsEndpoint(cl, ml, appConfig))
-	app.Post("/embeddings", auth, openai.EmbeddingsEndpoint(cl, ml, appConfig))
+	app.Post("/embeddings", openai.EmbeddingsEndpoint(cl, ml, appConfig))
-	app.Post("/v1/engines/:model/embeddings", auth, openai.EmbeddingsEndpoint(cl, ml, appConfig))
+	app.Post("/v1/engines/:model/embeddings", openai.EmbeddingsEndpoint(cl, ml, appConfig))
 	// audio
-	app.Post("/v1/audio/transcriptions", auth, openai.TranscriptEndpoint(cl, ml, appConfig))
+	app.Post("/v1/audio/transcriptions", openai.TranscriptEndpoint(cl, ml, appConfig))
-	app.Post("/v1/audio/speech", auth, localai.TTSEndpoint(cl, ml, appConfig))
+	app.Post("/v1/audio/speech", localai.TTSEndpoint(cl, ml, appConfig))
 	// images
-	app.Post("/v1/images/generations", auth, openai.ImageEndpoint(cl, ml, appConfig))
+	app.Post("/v1/images/generations", openai.ImageEndpoint(cl, ml, appConfig))
 	if appConfig.ImageDir != "" {
 		app.Static("/generated-images", appConfig.ImageDir)
@@ -81,6 +80,6 @@ func RegisterOpenAIRoutes(app *fiber.App,
 	}
 	// List models
-	app.Get("/v1/models", auth, openai.ListModelsEndpoint(cl, ml))
+	app.Get("/v1/models", openai.ListModelsEndpoint(cl, ml))
-	app.Get("/models", auth, openai.ListModelsEndpoint(cl, ml))
+	app.Get("/models", openai.ListModelsEndpoint(cl, ml))
 }
--- a/core/http/routes/ui.go
+++ b/core/http/routes/ui.go
@@ -59,8 +59,7 @@ func RegisterUIRoutes(app *fiber.App,
 	cl *config.BackendConfigLoader,
 	ml *model.ModelLoader,
 	appConfig *config.ApplicationConfig,
-	galleryService *services.GalleryService,
+	galleryService *services.GalleryService) {
 	auth func(*fiber.Ctx) error) {
 	// keeps the state of models that are being installed from the UI
 	var processingModels = NewModelOpCache()
@@ -85,10 +84,10 @@ func RegisterUIRoutes(app *fiber.App,
 		return processingModelsData, taskTypes
 	}
-	app.Get("/", auth, localai.WelcomeEndpoint(appConfig, cl, ml, modelStatus))
+	app.Get("/", localai.WelcomeEndpoint(appConfig, cl, ml, modelStatus))
 	if p2p.IsP2PEnabled() {
-		app.Get("/p2p", auth, func(c *fiber.Ctx) error {
+		app.Get("/p2p", func(c *fiber.Ctx) error {
 			summary := fiber.Map{
 				"Title":   "LocalAI - P2P dashboard",
 				"Version": internal.PrintableVersion(),
@@ -104,17 +103,17 @@ func RegisterUIRoutes(app *fiber.App,
 		})
 		/* show nodes live! */
-		app.Get("/p2p/ui/workers", auth, func(c *fiber.Ctx) error {
+		app.Get("/p2p/ui/workers", func(c *fiber.Ctx) error {
 			return c.SendString(elements.P2PNodeBoxes(p2p.GetAvailableNodes(p2p.NetworkID(appConfig.P2PNetworkID, p2p.WorkerID))))
 		})
-		app.Get("/p2p/ui/workers-federation", auth, func(c *fiber.Ctx) error {
+		app.Get("/p2p/ui/workers-federation", func(c *fiber.Ctx) error {
 			return c.SendString(elements.P2PNodeBoxes(p2p.GetAvailableNodes(p2p.NetworkID(appConfig.P2PNetworkID, p2p.FederatedID))))
 		})
-		app.Get("/p2p/ui/workers-stats", auth, func(c *fiber.Ctx) error {
+		app.Get("/p2p/ui/workers-stats", func(c *fiber.Ctx) error {
 			return c.SendString(elements.P2PNodeStats(p2p.GetAvailableNodes(p2p.NetworkID(appConfig.P2PNetworkID, p2p.WorkerID))))
 		})
-		app.Get("/p2p/ui/workers-federation-stats", auth, func(c *fiber.Ctx) error {
+		app.Get("/p2p/ui/workers-federation-stats", func(c *fiber.Ctx) error {
 			return c.SendString(elements.P2PNodeStats(p2p.GetAvailableNodes(p2p.NetworkID(appConfig.P2PNetworkID, p2p.FederatedID))))
 		})
 	}
@@ -122,7 +121,7 @@ func RegisterUIRoutes(app *fiber.App,
 	if !appConfig.DisableGalleryEndpoint {
 		// Show the Models page (all models)
-		app.Get("/browse", auth, func(c *fiber.Ctx) error {
+		app.Get("/browse", func(c *fiber.Ctx) error {
 			term := c.Query("term")
 			models, _ := gallery.AvailableGalleryModels(appConfig.Galleries, appConfig.ModelPath)
@@ -167,7 +166,7 @@ func RegisterUIRoutes(app *fiber.App,
 		// Show the models, filtered from the user input
 		// https://htmx.org/examples/active-search/
-		app.Post("/browse/search/models", auth, func(c *fiber.Ctx) error {
+		app.Post("/browse/search/models", func(c *fiber.Ctx) error {
 			form := struct {
 				Search string `form:"search"`
 			}{}
@@ -188,7 +187,7 @@ func RegisterUIRoutes(app *fiber.App,
 		// This route is used when the "Install" button is pressed, we submit here a new job to the gallery service
 		// https://htmx.org/examples/progress-bar/
-		app.Post("/browse/install/model/:id", auth, func(c *fiber.Ctx) error {
+		app.Post("/browse/install/model/:id", func(c *fiber.Ctx) error {
 			galleryID := strings.Clone(c.Params("id")) // note: strings.Clone is required for multiple requests!
 			log.Debug().Msgf("UI job submitted to install  : %+v\n", galleryID)
@@ -215,7 +214,7 @@ func RegisterUIRoutes(app *fiber.App,
 		// This route is used when the "Delete" button is pressed, we submit here a new delete job to the gallery service
 		// https://htmx.org/examples/progress-bar/
-		app.Post("/browse/delete/model/:id", auth, func(c *fiber.Ctx) error {
+		app.Post("/browse/delete/model/:id", func(c *fiber.Ctx) error {
 			galleryID := strings.Clone(c.Params("id")) // note: strings.Clone is required for multiple requests!
 			log.Debug().Msgf("UI job submitted to delete  : %+v\n", galleryID)
 			var galleryName = galleryID
@@ -255,7 +254,7 @@ func RegisterUIRoutes(app *fiber.App,
 		// Display the job current progress status
 		// If the job is done, we trigger the /browse/job/:uid route
 		// https://htmx.org/examples/progress-bar/
-		app.Get("/browse/job/progress/:uid", auth, func(c *fiber.Ctx) error {
+		app.Get("/browse/job/progress/:uid", func(c *fiber.Ctx) error {
 			jobUID := strings.Clone(c.Params("uid")) // note: strings.Clone is required for multiple requests!
 			status := galleryService.GetStatus(jobUID)
@@ -279,7 +278,7 @@ func RegisterUIRoutes(app *fiber.App,
 		// this route is hit when the job is done, and we display the
 		// final state (for now just displays "Installation completed")
-		app.Get("/browse/job/:uid", auth, func(c *fiber.Ctx) error {
+		app.Get("/browse/job/:uid", func(c *fiber.Ctx) error {
 			jobUID := strings.Clone(c.Params("uid")) // note: strings.Clone is required for multiple requests!
 			status := galleryService.GetStatus(jobUID)
@@ -303,8 +302,8 @@ func RegisterUIRoutes(app *fiber.App,
 	}
 	// Show the Chat page
-	app.Get("/chat/:model", auth, func(c *fiber.Ctx) error {
+	app.Get("/chat/:model", func(c *fiber.Ctx) error {
-		backendConfigs, _ := services.ListModels(cl, ml, "", true)
+		backendConfigs, _ := services.ListModels(cl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
 		summary := fiber.Map{
 			"Title":        "LocalAI - Chat with " + c.Params("model"),
@@ -318,8 +317,8 @@ func RegisterUIRoutes(app *fiber.App,
 		return c.Render("views/chat", summary)
 	})
-	app.Get("/talk/", auth, func(c *fiber.Ctx) error {
+	app.Get("/talk/", func(c *fiber.Ctx) error {
-		backendConfigs, _ := services.ListModels(cl, ml, "", true)
+		backendConfigs, _ := services.ListModels(cl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
 		if len(backendConfigs) == 0 {
 			// If no model is available redirect to the index which suggests how to install models
@@ -338,9 +337,9 @@ func RegisterUIRoutes(app *fiber.App,
 		return c.Render("views/talk", summary)
 	})
-	app.Get("/chat/", auth, func(c *fiber.Ctx) error {
+	app.Get("/chat/", func(c *fiber.Ctx) error {
-		backendConfigs, _ := services.ListModels(cl, ml, "", true)
+		backendConfigs, _ := services.ListModels(cl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
 		if len(backendConfigs) == 0 {
 			// If no model is available redirect to the index which suggests how to install models
@@ -359,7 +358,7 @@ func RegisterUIRoutes(app *fiber.App,
 		return c.Render("views/chat", summary)
 	})
-	app.Get("/text2image/:model", auth, func(c *fiber.Ctx) error {
+	app.Get("/text2image/:model", func(c *fiber.Ctx) error {
 		backendConfigs := cl.GetAllBackendConfigs()
 		summary := fiber.Map{
@@ -374,7 +373,7 @@ func RegisterUIRoutes(app *fiber.App,
 		return c.Render("views/text2image", summary)
 	})
-	app.Get("/text2image/", auth, func(c *fiber.Ctx) error {
+	app.Get("/text2image/", func(c *fiber.Ctx) error {
 		backendConfigs := cl.GetAllBackendConfigs()
@@ -395,7 +394,7 @@ func RegisterUIRoutes(app *fiber.App,
 		return c.Render("views/text2image", summary)
 	})
-	app.Get("/tts/:model", auth, func(c *fiber.Ctx) error {
+	app.Get("/tts/:model", func(c *fiber.Ctx) error {
 		backendConfigs := cl.GetAllBackendConfigs()
 		summary := fiber.Map{
@@ -410,7 +409,7 @@ func RegisterUIRoutes(app *fiber.App,
 		return c.Render("views/tts", summary)
 	})
-	app.Get("/tts/", auth, func(c *fiber.Ctx) error {
+	app.Get("/tts/", func(c *fiber.Ctx) error {
 		backendConfigs := cl.GetAllBackendConfigs()
--- a/core/p2p/federated_server.go
+++ b/core/p2p/federated_server.go
@@ -7,6 +7,7 @@ import (
 	"context"
 	"errors"
 	"fmt"
 	"io"
 	"net"
 	"github.com/mudler/edgevpn/pkg/node"
@@ -41,7 +42,7 @@ func (fs *FederatedServer) proxy(ctx context.Context, node *node.Node) error {
 		log.Error().Err(err).Msg("Error listening")
 		return err
 	}
-	//	ll.Info("Binding local port on", srcaddr)
+
 	go func() {
 		<-ctx.Done()
 		l.Close()
@@ -82,6 +83,7 @@ func (fs *FederatedServer) proxy(ctx context.Context, node *node.Node) error {
 				if workerID == "" {
 					log.Error().Msg("No available nodes yet")
 					fs.sendHTMLResponse(conn, 503, "Sorry, waiting for nodes to connect")
 					return
 				}
@@ -89,6 +91,7 @@ func (fs *FederatedServer) proxy(ctx context.Context, node *node.Node) error {
 				nodeData, exists := GetNode(fs.service, workerID)
 				if !exists {
 					log.Error().Msgf("Node %s not found", workerID)
 					fs.sendHTMLResponse(conn, 404, "Node not found")
 					return
 				}
@@ -100,3 +103,42 @@ func (fs *FederatedServer) proxy(ctx context.Context, node *node.Node) error {
 		}
 	}
 }
 // sendHTMLResponse writes a minimal HTTP/1.1 response carrying a one-heading
 // HTML page to conn and then closes the connection.
 //
 // statusCode is placed on the status line together with the reason phrase
 // returned by getHTTPStatusText; message is embedded verbatim (no escaping)
 // inside an <h1> element. A write failure is logged and not returned.
 func (fs *FederatedServer) sendHTMLResponse(conn net.Conn, statusCode int, message string) {
 	// The connection is closed on every path, including a failed write.
 	defer conn.Close()
 	// Both templates are kept as constants so the wire format is visible in one place.
 	const (
 		bodyFormat     = "<html><body><h1>%s</h1></body></html>\r\n"
 		responseFormat = "HTTP/1.1 %d %s\r\n" +
 			"Content-Type: text/html\r\n" +
 			"Connection: close\r\n" +
 			"\r\n" +
 			"%s"
 	)
 	payload := fmt.Sprintf(
 		responseFormat,
 		statusCode, getHTTPStatusText(statusCode), fmt.Sprintf(bodyFormat, message),
 	)
 	// Send the whole response in a single write to the client connection.
 	if _, err := io.WriteString(conn, payload); err != nil {
 		log.Error().Err(err).Msg("Error writing response to client")
 	}
 }
 // getHTTPStatusText maps an HTTP status code to its reason phrase.
 // Only 200, 404 and 503 are recognised; every other code yields "Unknown Status".
 func getHTTPStatusText(statusCode int) string {
 	// Start from the fallback and override it for the codes we know about.
 	text := "Unknown Status"
 	switch statusCode {
 	case 200:
 		text = "OK"
 	case 404:
 		text = "Not Found"
 	case 503:
 		text = "Service Unavailable"
 	}
 	return text
 }
--- a/core/schema/localai.go
+++ b/core/schema/localai.go
@@ -2,6 +2,7 @@ package schema
 import (
 	"github.com/mudler/LocalAI/core/p2p"
 	"github.com/mudler/LocalAI/pkg/model"
 	gopsutil "github.com/shirou/gopsutil/v3/process"
 )
@@ -9,6 +10,10 @@ type BackendMonitorRequest struct {
 	Model string `json:"model" yaml:"model"`
 }
 // TokenMetricsRequest is a request payload carrying a single model name,
 // serialized under the "model" key in both JSON and YAML.
 // NOTE(review): presumably consumed by a token-metrics endpoint — confirm against the handler.
 type TokenMetricsRequest struct {
 	Model string `json:"model" yaml:"model"`
 }
 type BackendMonitorResponse struct {
 	MemoryInfo    *gopsutil.MemoryInfoStat
 	MemoryPercent float32
@@ -72,5 +77,6 @@ type P2PNodesResponse struct {
 }
 type SystemInformationResponse struct {
-	Backends []string `json:"backends"`
+	Backends []string      `json:"backends"`
 	Models   []model.Model `json:"loaded_models"`
 }
--- a/core/schema/openai.go
+++ b/core/schema/openai.go
@@ -58,6 +58,8 @@ type Content struct {
 	Type     string     `json:"type" yaml:"type"`
 	Text     string     `json:"text" yaml:"text"`
 	ImageURL ContentURL `json:"image_url" yaml:"image_url"`
 	AudioURL ContentURL `json:"audio_url" yaml:"audio_url"`
 	VideoURL ContentURL `json:"video_url" yaml:"video_url"`
 }
 type ContentURL struct {
@@ -76,6 +78,8 @@ type Message struct {
 	StringContent string   `json:"string_content,omitempty" yaml:"string_content,omitempty"`
 	StringImages  []string `json:"string_images,omitempty" yaml:"string_images,omitempty"`
 	StringVideos  []string `json:"string_videos,omitempty" yaml:"string_videos,omitempty"`
 	StringAudios  []string `json:"string_audios,omitempty" yaml:"string_audios,omitempty"`
 	// A result of a function call
 	FunctionCall interface{} `json:"function_call,omitempty" yaml:"function_call,omitempty"`
--- a/core/schema/tokenize.go
+++ b/core/schema/tokenize.go
@@ -0,0 +1,10 @@
 package schema
 // TokenizeRequest is the JSON payload for a tokenization call: the text to
 // process ("content") and the name of the model to use ("model").
 // NOTE(review): field semantics are inferred from the names — confirm against the handler.
 type TokenizeRequest struct {
 	Content string `json:"content"`
 	Model   string `json:"model"`
 }
 // TokenizeResponse is the JSON payload returned for a tokenization call; it
 // carries the resulting token IDs as 32-bit integers under the "tokens" key.
 type TokenizeResponse struct {
 	Tokens []int32 `json:"tokens"`
 }
--- a/core/services/list_models.go
+++ b/core/services/list_models.go
@@ -1,55 +1,47 @@
 package services
 import (
 	"regexp"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/pkg/model"
 )
-func ListModels(bcl *config.BackendConfigLoader, ml *model.ModelLoader, filter string, excludeConfigured bool) ([]string, error) {
+type LooseFilePolicy int
-	models, err := ml.ListFilesInModelPath()
+const (
-	if err != nil {
+	SKIP_IF_CONFIGURED LooseFilePolicy = iota
-		return nil, err
+	SKIP_ALWAYS
-	}
+	ALWAYS_INCLUDE
 	LOOSE_ONLY
 )
-	var mm map[string]interface{} = map[string]interface{}{}
+func ListModels(bcl *config.BackendConfigLoader, ml *model.ModelLoader, filter config.BackendConfigFilterFn, looseFilePolicy LooseFilePolicy) ([]string, error) {
 	var skipMap map[string]interface{} = map[string]interface{}{}
 	dataModels := []string{}
-	var filterFn func(name string) bool
+	// Start with known configurations
-
+	if looseFilePolicy != LOOSE_ONLY {
-	// If filter is not specified, do not filter the list by model name
+		for _, c := range bcl.GetBackendConfigsByFilter(filter) {
-	if filter == "" {
+			if looseFilePolicy == SKIP_IF_CONFIGURED {
-		filterFn = func(_ string) bool { return true }
+				skipMap[c.Model] = nil
-	} else {
+			}
 		// If filter _IS_ specified, we compile it to a regex which is used to create the filterFn
 		rxp, err := regexp.Compile(filter)
 		if err != nil {
 			return nil, err
 		}
 		filterFn = func(name string) bool {
 			return rxp.MatchString(name)
 		}
 	}
 	// Start with the known configurations
 	for _, c := range bcl.GetAllBackendConfigs() {
 		if excludeConfigured {
 			mm[c.Model] = nil
 		}
 		if filterFn(c.Name) {
 			dataModels = append(dataModels, c.Name)
 		}
 	}
-	// Then iterate through the loose files:
+	// Then iterate through the loose files if requested.
-	for _, m := range models {
+	if looseFilePolicy != SKIP_ALWAYS {
-		// And only adds them if they shouldn't be skipped.
+
-		if _, exists := mm[m]; !exists && filterFn(m) {
+		models, err := ml.ListFilesInModelPath()
-			dataModels = append(dataModels, m)
+		if err != nil {
 			return nil, err
 		}
 		for _, m := range models {
 			// And only adds them if they shouldn't be skipped.
 			if _, exists := skipMap[m]; !exists && filter(m, nil) {
 				dataModels = append(dataModels, m)
 			}
 		}
 	}
--- a/core/startup/startup.go
+++ b/core/startup/startup.go
@@ -1,206 +1,237 @@
-package startup
+package startup
-
+
-import (
+import (
-	"fmt"
+	"fmt"
-	"os"
+	"os"
-
+
-	"github.com/mudler/LocalAI/core"
+	"github.com/mudler/LocalAI/core"
-	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/backend"
-	"github.com/mudler/LocalAI/core/services"
+	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/internal"
+	"github.com/mudler/LocalAI/core/services"
-	"github.com/mudler/LocalAI/pkg/assets"
+	"github.com/mudler/LocalAI/internal"
-	"github.com/mudler/LocalAI/pkg/library"
+	"github.com/mudler/LocalAI/pkg/assets"
-	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/pkg/library"
-	pkgStartup "github.com/mudler/LocalAI/pkg/startup"
+	"github.com/mudler/LocalAI/pkg/model"
-	"github.com/mudler/LocalAI/pkg/xsysinfo"
+	pkgStartup "github.com/mudler/LocalAI/pkg/startup"
-	"github.com/rs/zerolog/log"
+	"github.com/mudler/LocalAI/pkg/xsysinfo"
-)
+	"github.com/rs/zerolog/log"
-
+)
-func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.ModelLoader, *config.ApplicationConfig, error) {
+
-	options := config.NewApplicationConfig(opts...)
+func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.ModelLoader, *config.ApplicationConfig, error) {
-
+	options := config.NewApplicationConfig(opts...)
-	log.Info().Msgf("Starting LocalAI using %d threads, with models path: %s", options.Threads, options.ModelPath)
+
-	log.Info().Msgf("LocalAI version: %s", internal.PrintableVersion())
+	log.Info().Msgf("Starting LocalAI using %d threads, with models path: %s", options.Threads, options.ModelPath)
-	caps, err := xsysinfo.CPUCapabilities()
+	log.Info().Msgf("LocalAI version: %s", internal.PrintableVersion())
-	if err == nil {
+	caps, err := xsysinfo.CPUCapabilities()
-		log.Debug().Msgf("CPU capabilities: %v", caps)
+	if err == nil {
-	}
+		log.Debug().Msgf("CPU capabilities: %v", caps)
-	gpus, err := xsysinfo.GPUs()
+	}
-	if err == nil {
+	gpus, err := xsysinfo.GPUs()
-		log.Debug().Msgf("GPU count: %d", len(gpus))
+	if err == nil {
-		for _, gpu := range gpus {
+		log.Debug().Msgf("GPU count: %d", len(gpus))
-			log.Debug().Msgf("GPU: %s", gpu.String())
+		for _, gpu := range gpus {
-		}
+			log.Debug().Msgf("GPU: %s", gpu.String())
-	}
+		}
-
+	}
-	// Make sure directories exists
+
-	if options.ModelPath == "" {
+	// Make sure directories exists
-		return nil, nil, nil, fmt.Errorf("options.ModelPath cannot be empty")
+	if options.ModelPath == "" {
-	}
+		return nil, nil, nil, fmt.Errorf("options.ModelPath cannot be empty")
-	err = os.MkdirAll(options.ModelPath, 0750)
+	}
-	if err != nil {
+	err = os.MkdirAll(options.ModelPath, 0750)
-		return nil, nil, nil, fmt.Errorf("unable to create ModelPath: %q", err)
+	if err != nil {
-	}
+		return nil, nil, nil, fmt.Errorf("unable to create ModelPath: %q", err)
-	if options.ImageDir != "" {
+	}
-		err := os.MkdirAll(options.ImageDir, 0750)
+	if options.ImageDir != "" {
-		if err != nil {
+		err := os.MkdirAll(options.ImageDir, 0750)
-			return nil, nil, nil, fmt.Errorf("unable to create ImageDir: %q", err)
+		if err != nil {
-		}
+			return nil, nil, nil, fmt.Errorf("unable to create ImageDir: %q", err)
-	}
+		}
-	if options.AudioDir != "" {
+	}
-		err := os.MkdirAll(options.AudioDir, 0750)
+	if options.AudioDir != "" {
-		if err != nil {
+		err := os.MkdirAll(options.AudioDir, 0750)
-			return nil, nil, nil, fmt.Errorf("unable to create AudioDir: %q", err)
+		if err != nil {
-		}
+			return nil, nil, nil, fmt.Errorf("unable to create AudioDir: %q", err)
-	}
+		}
-	if options.UploadDir != "" {
+	}
-		err := os.MkdirAll(options.UploadDir, 0750)
+	if options.UploadDir != "" {
-		if err != nil {
+		err := os.MkdirAll(options.UploadDir, 0750)
-			return nil, nil, nil, fmt.Errorf("unable to create UploadDir: %q", err)
+		if err != nil {
-		}
+			return nil, nil, nil, fmt.Errorf("unable to create UploadDir: %q", err)
-	}
+		}
-
+	}
-	if err := pkgStartup.InstallModels(options.Galleries, options.ModelLibraryURL, options.ModelPath, options.EnforcePredownloadScans, nil, options.ModelsURL...); err != nil {
+
-		log.Error().Err(err).Msg("error installing models")
+	if err := pkgStartup.InstallModels(options.Galleries, options.ModelLibraryURL, options.ModelPath, options.EnforcePredownloadScans, nil, options.ModelsURL...); err != nil {
-	}
+		log.Error().Err(err).Msg("error installing models")
-
+	}
-	cl := config.NewBackendConfigLoader(options.ModelPath)
+
-	ml := model.NewModelLoader(options.ModelPath)
+	cl := config.NewBackendConfigLoader(options.ModelPath)
-
+	ml := model.NewModelLoader(options.ModelPath)
-	configLoaderOpts := options.ToConfigLoaderOptions()
+
-
+	configLoaderOpts := options.ToConfigLoaderOptions()
-	if err := cl.LoadBackendConfigsFromPath(options.ModelPath, configLoaderOpts...); err != nil {
+
-		log.Error().Err(err).Msg("error loading config files")
+	if err := cl.LoadBackendConfigsFromPath(options.ModelPath, configLoaderOpts...); err != nil {
-	}
+		log.Error().Err(err).Msg("error loading config files")
-
+	}
-	if options.ConfigFile != "" {
+
-		if err := cl.LoadMultipleBackendConfigsSingleFile(options.ConfigFile, configLoaderOpts...); err != nil {
+	if options.ConfigFile != "" {
-			log.Error().Err(err).Msg("error loading config file")
+		if err := cl.LoadMultipleBackendConfigsSingleFile(options.ConfigFile, configLoaderOpts...); err != nil {
-		}
+			log.Error().Err(err).Msg("error loading config file")
-	}
+		}
-
+	}
-	if err := cl.Preload(options.ModelPath); err != nil {
+
-		log.Error().Err(err).Msg("error downloading models")
+	if err := cl.Preload(options.ModelPath); err != nil {
-	}
+		log.Error().Err(err).Msg("error downloading models")
-
+	}
-	if options.PreloadJSONModels != "" {
+
-		if err := services.ApplyGalleryFromString(options.ModelPath, options.PreloadJSONModels, options.EnforcePredownloadScans, options.Galleries); err != nil {
+	if options.PreloadJSONModels != "" {
-			return nil, nil, nil, err
+		if err := services.ApplyGalleryFromString(options.ModelPath, options.PreloadJSONModels, options.EnforcePredownloadScans, options.Galleries); err != nil {
-		}
+			return nil, nil, nil, err
-	}
+		}
-
+	}
-	if options.PreloadModelsFromPath != "" {
+
-		if err := services.ApplyGalleryFromFile(options.ModelPath, options.PreloadModelsFromPath, options.EnforcePredownloadScans, options.Galleries); err != nil {
+	if options.PreloadModelsFromPath != "" {
-			return nil, nil, nil, err
+		if err := services.ApplyGalleryFromFile(options.ModelPath, options.PreloadModelsFromPath, options.EnforcePredownloadScans, options.Galleries); err != nil {
-		}
+			return nil, nil, nil, err
-	}
+		}
-
+	}
-	if options.Debug {
+
-		for _, v := range cl.GetAllBackendConfigs() {
+	if options.Debug {
-			log.Debug().Msgf("Model: %s (config: %+v)", v.Name, v)
+		for _, v := range cl.GetAllBackendConfigs() {
-		}
+			log.Debug().Msgf("Model: %s (config: %+v)", v.Name, v)
-	}
+		}
-
+	}
-	if options.AssetsDestination != "" {
+
-		// Extract files from the embedded FS
+	if options.AssetsDestination != "" {
-		err := assets.ExtractFiles(options.BackendAssets, options.AssetsDestination)
+		// Extract files from the embedded FS
-		log.Debug().Msgf("Extracting backend assets files to %s", options.AssetsDestination)
+		err := assets.ExtractFiles(options.BackendAssets, options.AssetsDestination)
-		if err != nil {
+		log.Debug().Msgf("Extracting backend assets files to %s", options.AssetsDestination)
-			log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly)", err)
+		if err != nil {
-		}
+			log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly)", err)
-	}
+		}
-
+	}
-	if options.LibPath != "" {
+
-		// If there is a lib directory, set LD_LIBRARY_PATH to include it
+	if options.LibPath != "" {
-		err := library.LoadExternal(options.LibPath)
+		// If there is a lib directory, set LD_LIBRARY_PATH to include it
-		if err != nil {
+		err := library.LoadExternal(options.LibPath)
-			log.Error().Err(err).Str("LibPath", options.LibPath).Msg("Error while loading external libraries")
+		if err != nil {
-		}
+			log.Error().Err(err).Str("LibPath", options.LibPath).Msg("Error while loading external libraries")
-	}
+		}
-
+	}
-	// turn off any process that was started by GRPC if the context is canceled
+
-	go func() {
+	// turn off any process that was started by GRPC if the context is canceled
-		<-options.Context.Done()
+	go func() {
-		log.Debug().Msgf("Context canceled, shutting down")
+		<-options.Context.Done()
-		err := ml.StopAllGRPC()
+		log.Debug().Msgf("Context canceled, shutting down")
-		if err != nil {
+		err := ml.StopAllGRPC()
-			log.Error().Err(err).Msg("error while stopping all grpc backends")
+		if err != nil {
-		}
+			log.Error().Err(err).Msg("error while stopping all grpc backends")
-	}()
+		}
-
+	}()
-	if options.WatchDog {
+
-		wd := model.NewWatchDog(
+	if options.WatchDog {
-			ml,
+		wd := model.NewWatchDog(
-			options.WatchDogBusyTimeout,
+			ml,
-			options.WatchDogIdleTimeout,
+			options.WatchDogBusyTimeout,
-			options.WatchDogBusy,
+			options.WatchDogIdleTimeout,
-			options.WatchDogIdle)
+			options.WatchDogBusy,
-		ml.SetWatchDog(wd)
+			options.WatchDogIdle)
-		go wd.Run()
+		ml.SetWatchDog(wd)
-		go func() {
+		go wd.Run()
-			<-options.Context.Done()
+		go func() {
-			log.Debug().Msgf("Context canceled, shutting down")
+			<-options.Context.Done()
-			wd.Shutdown()
+			log.Debug().Msgf("Context canceled, shutting down")
-		}()
+			wd.Shutdown()
-	}
+		}()
-
+	}
-	// Watch the configuration directory
+
-	startWatcher(options)
+	if options.LoadToMemory != nil {
-
+		for _, m := range options.LoadToMemory {
-	log.Info().Msg("core/startup process completed!")
+			cfg, err := cl.LoadBackendConfigFileByName(m, options.ModelPath,
-	return cl, ml, options, nil
+				config.LoadOptionDebug(options.Debug),
-}
+				config.LoadOptionThreads(options.Threads),
-
+				config.LoadOptionContextSize(options.ContextSize),
-func startWatcher(options *config.ApplicationConfig) {
+				config.LoadOptionF16(options.F16),
-	if options.DynamicConfigsDir == "" {
+				config.ModelPath(options.ModelPath),
-		// No need to start the watcher if the directory is not set
+			)
-		return
+			if err != nil {
-	}
+				return nil, nil, nil, err
-
+			}
-	if _, err := os.Stat(options.DynamicConfigsDir); err != nil {
+
-		if os.IsNotExist(err) {
+			log.Debug().Msgf("Auto loading model %s into memory from file: %s", m, cfg.Model)
-			// We try to create the directory if it does not exist and was specified
+
-			if err := os.MkdirAll(options.DynamicConfigsDir, 0700); err != nil {
+			o := backend.ModelOptions(*cfg, options, []model.Option{})
-				log.Error().Err(err).Msg("failed creating DynamicConfigsDir")
+
-			}
+			var backendErr error
-		} else {
+			if cfg.Backend != "" {
-			// something else happened, we log the error and don't start the watcher
+				o = append(o, model.WithBackendString(cfg.Backend))
-			log.Error().Err(err).Msg("failed to read DynamicConfigsDir, watcher will not be started")
+				_, backendErr = ml.BackendLoader(o...)
-			return
+			} else {
-		}
+				_, backendErr = ml.GreedyLoader(o...)
-	}
+			}
-
+			if backendErr != nil {
-	configHandler := newConfigFileHandler(options)
+				// err is nil here (it was checked right after loading the config), so
+				// returning it would report success alongside nil loaders; surface the
+				// backend load failure instead.
+				return nil, nil, nil, backendErr
-	if err := configHandler.Watch(); err != nil {
+			}
-		log.Error().Err(err).Msg("failed creating watcher")
+		}
-	}
+	}
-}
+
-
+	// Watch the configuration directory
-// In Lieu of a proper DI framework, this function wires up the Application manually.
+	startWatcher(options)
-// This is in core/startup rather than core/state.go to keep package references clean!
+
-func createApplication(appConfig *config.ApplicationConfig) *core.Application {
+	log.Info().Msg("core/startup process completed!")
-	app := &core.Application{
+	return cl, ml, options, nil
-		ApplicationConfig:   appConfig,
+}
-		BackendConfigLoader: config.NewBackendConfigLoader(appConfig.ModelPath),
+
-		ModelLoader:         model.NewModelLoader(appConfig.ModelPath),
+func startWatcher(options *config.ApplicationConfig) {
-	}
+	if options.DynamicConfigsDir == "" {
-
+		// No need to start the watcher if the directory is not set
-	var err error
+		return
-
+	}
-	// app.EmbeddingsBackendService = backend.NewEmbeddingsBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
+
-	// app.ImageGenerationBackendService = backend.NewImageGenerationBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
+	if _, err := os.Stat(options.DynamicConfigsDir); err != nil {
-	// app.LLMBackendService = backend.NewLLMBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
+		if os.IsNotExist(err) {
-	// app.TranscriptionBackendService = backend.NewTranscriptionBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
+			// We try to create the directory if it does not exist and was specified
-	// app.TextToSpeechBackendService = backend.NewTextToSpeechBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
+			if err := os.MkdirAll(options.DynamicConfigsDir, 0700); err != nil {
-
+				log.Error().Err(err).Msg("failed creating DynamicConfigsDir")
-	app.BackendMonitorService = services.NewBackendMonitorService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
+			}
-	app.GalleryService = services.NewGalleryService(app.ApplicationConfig)
+		} else {
-	// app.OpenAIService = services.NewOpenAIService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig, app.LLMBackendService)
+			// something else happened, we log the error and don't start the watcher
-
+			log.Error().Err(err).Msg("failed to read DynamicConfigsDir, watcher will not be started")
-	app.LocalAIMetricsService, err = services.NewLocalAIMetricsService()
+			return
-	if err != nil {
+		}
-		log.Error().Err(err).Msg("encountered an error initializing metrics service, startup will continue but metrics will not be tracked.")
+	}
-	}
+
-
+	configHandler := newConfigFileHandler(options)
-	return app
+	if err := configHandler.Watch(); err != nil {
-}
+		log.Error().Err(err).Msg("failed creating watcher")
 	}
 }
 // createApplication wires up a core.Application by hand, standing in for a
 // dependency-injection framework. It lives in core/startup rather than
 // core/state.go to keep package references clean.
 //
 // Both loaders are built from appConfig.ModelPath. A failure to initialize the
 // metrics service is logged and does not prevent the application from being returned.
 func createApplication(appConfig *config.ApplicationConfig) *core.Application {
 	backendConfigLoader := config.NewBackendConfigLoader(appConfig.ModelPath)
 	modelLoader := model.NewModelLoader(appConfig.ModelPath)
 	app := &core.Application{
 		ApplicationConfig:   appConfig,
 		BackendConfigLoader: backendConfigLoader,
 		ModelLoader:         modelLoader,
 	}
 	// app.EmbeddingsBackendService = backend.NewEmbeddingsBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
 	// app.ImageGenerationBackendService = backend.NewImageGenerationBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
 	// app.LLMBackendService = backend.NewLLMBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
 	// app.TranscriptionBackendService = backend.NewTranscriptionBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
 	// app.TextToSpeechBackendService = backend.NewTextToSpeechBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
 	app.BackendMonitorService = services.NewBackendMonitorService(modelLoader, backendConfigLoader, appConfig)
 	app.GalleryService = services.NewGalleryService(appConfig)
 	// app.OpenAIService = services.NewOpenAIService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig, app.LLMBackendService)
 	metricsService, err := services.NewLocalAIMetricsService()
 	app.LocalAIMetricsService = metricsService
 	if err != nil {
 		log.Error().Err(err).Msg("encountered an error initializing metrics service, startup will continue but metrics will not be tracked.")
 	}
 	return app
 }
--- a/docs/content/docs/getting-started/container-images.md
+++ b/docs/content/docs/getting-started/container-images.md
@@ -154,7 +154,7 @@ Images are available with and without python dependencies. Note that images with
 Images with `core` in the tag are smaller and do not contain any python dependencies. 
-{{< tabs tabTotal="6" >}}
+{{< tabs tabTotal="7" >}}
 {{% tab tabName="Vanilla / CPU Images" %}}
 | Description | Quay | Docker Hub                                   |
@@ -227,6 +227,15 @@ Images with `core` in the tag are smaller and do not contain any python dependen
 {{% /tab %}}
 {{% tab tabName="Vulkan Images" %}}
 | Description | Quay | Docker Hub                                                  |
 | --- | --- |-------------------------------------------------------------|
 | Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master-vulkan-ffmpeg-core` | `localai/localai:master-vulkan-ffmpeg-core` |
 | Latest tag | `quay.io/go-skynet/local-ai:latest-vulkan-ffmpeg-core` | `localai/localai:latest-vulkan-ffmpeg-core` |
 | Versioned image including FFMpeg, no python | `quay.io/go-skynet/local-ai:{{< version >}}-vulkan-ffmpeg-core` | `localai/localai:{{< version >}}-vulkan-ffmpeg-core` |
 {{% /tab %}}
 {{< /tabs >}}
 ## See Also
--- a/Show More
+++ b/Show More
`@@ -1,2 +1,2 @@`
	`grpcio==1.66.1`	`grpcio==1.66.2`
	`protobuf`	`protobuf`