fix(go-grpc-server): always close resultChan

By not closing the channel, if a server not implementing PredictStream receives a client call would hang indefinetly as would wait for resultChan to be consumed. If the prediction stream returns we close the channel now and we wait for the goroutine to finish. Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
feat(vllm): add support for image-to-text and video-to-text (#3729 )
2026-05-23 16:20:01 -04:00 · 2024-10-05 00:07:58 +02:00 · 2024-10-04 23:42:05 +02:00 · 2024-10-04 19:52:43 +02:00 · 2024-10-04 18:32:29 +02:00 · 2024-10-04 09:10:07 +02:00
132 changed files with 2836 additions and 929 deletions
--- a/.devcontainer-scripts/utils.sh
+++ b/.devcontainer-scripts/utils.sh
@@ -9,6 +9,7 @@
 # Param 2: email
 #
 config_user() {
+    echo "Configuring git for $1 <$2>"
    local gcn=$(git config --global user.name)
    if [ -z "${gcn}" ]; then
        echo "Setting up git user / remote"
@@ -24,6 +25,7 @@ config_user() {
 # Param 2: remote url
 #
 config_remote() {
+    echo "Adding git remote and fetching $2 as $1"
    local gr=$(git remote -v | grep $1)
    if [ -z "${gr}" ]; then
        git remote add $1 $2
--- a/.github/check_and_update.py
+++ b/.github/check_and_update.py
@@ -29,9 +29,14 @@ def calculate_sha256(file_path):
 def manual_safety_check_hf(repo_id):
    scanResponse = requests.get('https://huggingface.co/api/models/' + repo_id + "/scan")
    scan = scanResponse.json()
-    if scan['hasUnsafeFile']:
-        return scan
-    return None
+    # Check if 'hasUnsafeFile' exists in the response
+    if 'hasUnsafeFile' in scan:
+        if scan['hasUnsafeFile']:
+            return scan
+        else:
+            return None
+    else:
+        return None

 download_type, repo_id_or_url = parse_uri(uri)

--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -13,6 +13,78 @@ concurrency:
  cancel-in-progress: true

 jobs:
+  hipblas-jobs:
+    uses: ./.github/workflows/image_build.yml
+    with:
+      tag-latest: ${{ matrix.tag-latest }}
+      tag-suffix: ${{ matrix.tag-suffix }}
+      ffmpeg: ${{ matrix.ffmpeg }}
+      image-type: ${{ matrix.image-type }}
+      build-type: ${{ matrix.build-type }}
+      cuda-major-version: ${{ matrix.cuda-major-version }}
+      cuda-minor-version: ${{ matrix.cuda-minor-version }}
+      platforms: ${{ matrix.platforms }}
+      runs-on: ${{ matrix.runs-on }}
+      base-image: ${{ matrix.base-image }}
+      grpc-base-image: ${{ matrix.grpc-base-image }}
+      aio: ${{ matrix.aio }}
+      makeflags: ${{ matrix.makeflags }}
+      latest-image: ${{ matrix.latest-image }}
+      latest-image-aio: ${{ matrix.latest-image-aio }}
+    secrets:
+      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
+      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
+      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
+    strategy:
+      # Pushing with all jobs in parallel
+      # eats the bandwidth of all the nodes
+      max-parallel: 2
+      matrix:
+        include:
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-hipblas-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            aio: "-aio-gpu-hipblas"
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            latest-image: 'latest-gpu-hipblas'
+            latest-image-aio: 'latest-aio-gpu-hipblas'
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas'
+            ffmpeg: 'false'
+            image-type: 'extras'
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas-core'
+            ffmpeg: 'false'
+            image-type: 'core'
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
  self-hosted-jobs:
    uses: ./.github/workflows/image_build.yml
    with:
@@ -39,7 +111,7 @@ jobs:
    strategy:
      # Pushing with all jobs in parallel
      # eats the bandwidth of all the nodes
-      max-parallel: ${{ github.event_name != 'pull_request' && 6 || 10 }}
+      max-parallel: ${{ github.event_name != 'pull_request' && 5 || 8 }}
      matrix:
        include:
          # Extra images
@@ -122,29 +194,6 @@ jobs:
            base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'auto'
-            tag-suffix: '-hipblas-ffmpeg'
-            ffmpeg: 'true'
-            image-type: 'extras'
-            aio: "-aio-gpu-hipblas"
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
-            latest-image: 'latest-gpu-hipblas'
-            latest-image-aio: 'latest-aio-gpu-hipblas'
-            runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-hipblas'
-            ffmpeg: 'false'
-            image-type: 'extras'
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
-            runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
@@ -212,26 +261,6 @@ jobs:
            image-type: 'core'
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-hipblas-ffmpeg-core'
-            ffmpeg: 'true'
-            image-type: 'core'
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
-            runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-hipblas-core'
-            ffmpeg: 'false'
-            image-type: 'core'
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
-            runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"

  core-image-build:
    uses: ./.github/workflows/image_build.yml
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -18,7 +18,7 @@ jobs:
        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
        if: ${{ github.actor != 'dependabot[bot]' }}
-        uses: securego/gosec@v2.21.0
+        uses: securego/gosec@v2.21.4
        with:
          # we let the report trigger content trigger a failure using the GitHub Security features.
          args: '-no-fail -fmt sarif -out results.sarif ./...'
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -178,13 +178,22 @@ jobs:
        uses: actions/checkout@v4
        with:
          submodules: true
+      - name: Dependencies
+        run: |
+          # Install protoc
+          curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
+          unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+          rm protoc.zip
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
+          PATH="$PATH:$HOME/go/bin" make protogen-go
      - name: Build images
        run: |
          docker build --build-arg FFMPEG=true --build-arg IMAGE_TYPE=extras --build-arg EXTRA_BACKENDS=rerankers --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile .
          BASE_IMAGE=local-ai:tests DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
      - name: Test
        run: |
-          LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
+            PATH="$PATH:$HOME/go/bin" LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
            make run-e2e-aio
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -15,8 +15,6 @@ Thank you for your interest in contributing to LocalAI! We appreciate your time
 - [Documentation](#documentation)
 - [Community and Communication](#community-and-communication)

-
-
 ## Getting Started

 ### Prerequisites
@@ -54,7 +52,7 @@ If you find a bug, have a feature request, or encounter any issues, please check

 ## Coding Guidelines

- No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like []`golangci-lint`](https://golangci-lint.run) can help you here.
+- No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like [`golangci-lint`](https://golangci-lint.run) can help you here.

 ## Testing

@@ -84,5 +82,3 @@ We are welcome the contribution of the documents, please open new PR or create a
 - You can reach out via the Github issue tracker.
 - Open a new discussion at [Discussion](https://github.com/go-skynet/LocalAI/discussions)
 - Join the Discord channel [Discord](https://discord.gg/uJAeKSAGDy)
-
---
--- a/13
+++ b/13
@@ -297,10 +297,10 @@ COPY .git .
 RUN make prepare

 ## Build the binary
-## If it's CUDA, we want to skip some of the llama-compat backends to save space
-## We only leave the most CPU-optimized variant and the fallback for the cublas build
-## (both will use CUDA for the actual computation)
-RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
+## If it's CUDA or hipblas, we want to skip some of the llama-compat backends to save space
+## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
+## (both will use CUDA or hipblas for the actual computation)
+RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
        SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
    else \
        make build; \
@@ -338,9 +338,8 @@ RUN if [ "${FFMPEG}" = "true" ]; then \

 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
-        ssh less && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
+        ssh less wget
+# For the devcontainer, leave apt functional in case additional devtools are needed at runtime.

 RUN go install github.com/go-delve/delve/cmd/dlv@latest

--- a/9
+++ b/9
@@ -8,7 +8,7 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=e6b7801bd189d102d901d3e72035611a25456ef1
+CPPLLAMA_VERSION?=d5ed2b929d85bbd7dbeecb690880f07d9d7a6077

 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
@@ -16,7 +16,7 @@ RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
-WHISPER_CPP_VERSION?=a551933542d956ae84634937acd2942eb40efaaf
+WHISPER_CPP_VERSION?=ccc2547210e09e3a1785817383ab770389bb442b

 # bert.cpp version
 BERT_REPO?=https://github.com/go-skynet/go-bert.cpp
@@ -359,6 +359,9 @@ clean-tests:
 	rm -rf test-dir
 	rm -rf core/http/backend-assets

+clean-dc: clean
+	cp -r /build/backend-assets /workspace/backend-assets
+
 ## Build:
 build: prepare backend-assets grpcs ## Build the project
 	$(info ${GREEN}I local-ai build info:${RESET})
@@ -465,7 +468,7 @@ run-e2e-image:
 	ls -liah $(abspath ./tests/e2e-fixtures)
 	docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --gpus all --name e2e-tests-$(RANDOM) localai-tests

-run-e2e-aio:
+run-e2e-aio: protogen-go
 	@echo 'Running e2e AIO tests'
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e-aio

--- a/README.md
+++ b/README.md
@@ -68,9 +68,7 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu

 [💻 Getting started](https://localai.io/basics/getting_started/index.html)

-## 🔥🔥 Hot topics / Roadmap
-
-[Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
+## 📰 Latest project news

 - Aug 2024:  🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io)
 - July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723
@@ -83,8 +81,12 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
 - May 2024: Chat, TTS, and Image generation in the WebUI: https://github.com/mudler/LocalAI/pull/2222
 - April 2024: Reranker API: https://github.com/mudler/LocalAI/pull/2121

-Hot topics (looking for contributors):
+Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

+## 🔥🔥 Hot topics (looking for help):
+
+- Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
+- Realtime API https://github.com/mudler/LocalAI/issues/3714
 - 🔥🔥 Distributed, P2P Global community pools: https://github.com/mudler/LocalAI/issues/3113
 - WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
 - Backends v2: https://github.com/mudler/LocalAI/issues/1126
--- a/aio/cpu/vision.yaml
+++ b/aio/cpu/vision.yaml
@@ -2,7 +2,7 @@ backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
-name: gpt-4-vision-preview
+name: gpt-4o

 roles:
  user: "USER:"
--- a/aio/gpu-8g/vision.yaml
+++ b/aio/gpu-8g/vision.yaml
@@ -2,7 +2,7 @@ backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
-name: gpt-4-vision-preview
+name: gpt-4o

 roles:
  user: "USER:"
--- a/aio/intel/vision.yaml
+++ b/aio/intel/vision.yaml
@@ -2,7 +2,7 @@ backend: llama-cpp
 context_size: 4096
 mmap: false
 f16: false
-name: gpt-4-vision-preview
+name: gpt-4o

 roles:
  user: "USER:"
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -26,6 +26,19 @@ service Backend {
  rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}

  rpc Rerank(RerankRequest) returns (RerankResult) {}
+
+  rpc GetMetrics(MetricsRequest) returns (MetricsResponse);
+}
+
+// Define the empty request
+message MetricsRequest {}
+
+message MetricsResponse {
+  int32 slot_id = 1;
+  string prompt_json_for_slot = 2;  // Stores the prompt as a JSON string.
+  float tokens_per_second = 3;
+  int32 tokens_generated = 4;
+  int32 prompt_tokens_processed = 5;
 }

 message RerankRequest {
@@ -134,6 +147,9 @@ message PredictOptions {
  repeated string Images = 42;
  bool UseTokenizerTemplate = 43;
  repeated Message Messages = 44;
+  repeated string Videos = 45;
+  repeated string Audios = 46;
+  string CorrelationId = 47;
 }

 // The response message containing the result
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -13,6 +13,7 @@
 #include <getopt.h>
 #include "clip.h"
 #include "llava.h"
+#include "log.h"
 #include "stb_image.h"
 #include "common.h"
 #include "json.hpp"
@@ -448,7 +449,7 @@ struct llama_server_context
            LOG_INFO("Multi Modal Mode Enabled", {});
            clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
            if(clp_ctx == nullptr) {
-                LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
+                LOG_ERR("unable to load clip model: %s", params.mmproj.c_str());
                return false;
            }

@@ -462,7 +463,7 @@ struct llama_server_context
        ctx = llama_init.context;
        if (model == nullptr)
        {
-            LOG_ERROR("unable to load model", {{"model", params.model}});
+            LOG_ERR("unable to load model: %s", params.model.c_str());
            return false;
        }

@@ -470,7 +471,7 @@ struct llama_server_context
            const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
            const int n_embd_llm  = llama_n_embd(model);
            if (n_embd_clip != n_embd_llm) {
-                LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
+                LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
                llama_free(ctx);
                llama_free_model(model);
                return false;
@@ -489,11 +490,21 @@ struct llama_server_context
        std::vector<char> buf(1);
        int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
        if (res < 0) {
-            LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
+            LOG_ERR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", __func__);
            sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
        }
    }

+    llama_client_slot* get_active_slot() {
+        for (llama_client_slot& slot : slots) {
+            // Check if the slot is currently processing
+            if (slot.is_processing()) {
+                return &slot;  // Return the active slot
+            }
+        }
+        return nullptr;  // No active slot found
+    }
+
    void initialize() {
        // create slots
        all_slots_are_idle = true;
@@ -812,10 +823,11 @@ struct llama_server_context
                    img_sl.img_data = clip_image_u8_init();
                    if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
                    {
-                        LOG_ERROR("failed to load image", {
-                            {"slot_id",   slot->id},
-                            {"img_sl_id", img_sl.id}
-                        });
+                        LOG_ERR("%s: failed to load image, slot_id: %d, img_sl_id: %d", 
+                             __func__,
+                             slot->id,
+                             img_sl.id
+                        );
                        return false;
                    }
                    LOG_VERBOSE("image loaded", {
@@ -853,12 +865,12 @@ struct llama_server_context
                                    }
                                }
                                if (!found) {
-                                    LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id);
+                                    LOG("ERROR: Image with id: %i, not found.\n", img_id);
                                    slot->images.clear();
                                    return false;
                                }
                            } catch (const std::invalid_argument& e) {
-                                LOG_TEE("Invalid image number id in prompt\n");
+                                LOG("Invalid image number id in prompt\n");
                                slot->images.clear();
                                return false;
                            }
@@ -886,7 +898,7 @@ struct llama_server_context
            {"task_id", slot->task_id},
        });

-      //  LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
+      //  LOG("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());

        return true;
    }
@@ -926,7 +938,7 @@ struct llama_server_context
                };
                if (llama_decode(ctx, batch_view) != 0)
                {
-                    LOG_TEE("%s: llama_decode() failed\n", __func__);
+                    LOG("%s: llama_decode() failed\n", __func__);
                    return;
                }
            }
@@ -938,7 +950,7 @@ struct llama_server_context
            }
        }

-        LOG_TEE("system prompt updated\n");
+        LOG("system prompt updated\n");
        system_need_update = false;
    }

@@ -1120,7 +1132,7 @@ struct llama_server_context
            }

            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
-                LOG_TEE("Error processing the given image");
+                LOG("Error processing the given image");
                return false;
            }

@@ -1132,7 +1144,7 @@ struct llama_server_context

    void send_error(task_server& task, const std::string &error)
    {
-        LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
+        LOG("task %i - error: %s\n", task.id, error.c_str());
        task_result res;
        res.id = task.id;
        res.multitask_id = task.multitask_id;
@@ -1371,7 +1383,7 @@ struct llama_server_context
                };
                if (llama_decode(ctx, batch_view))
                {
-                    LOG_TEE("%s : failed to eval\n", __func__);
+                    LOG("%s : failed to eval\n", __func__);
                    return false;
                }
            }
@@ -1389,7 +1401,7 @@ struct llama_server_context
                llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, };
                if (llama_decode(ctx, batch_img))
                {
-                    LOG_TEE("%s : failed to eval image\n", __func__);
+                    LOG("%s : failed to eval image\n", __func__);
                    return false;
                }
                slot.n_past += n_eval;
@@ -1572,7 +1584,7 @@ struct llama_server_context
                    slot.n_past = 0;
                    slot.truncated = false;
                    slot.has_next_token = true;
-                    LOG_TEE("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
+                    LOG("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());

                    continue;
                    // END LOCALAI changes
@@ -1820,10 +1832,11 @@ struct llama_server_context

                    if (has_images && !ingest_images(slot, n_batch))
                    {
-                        LOG_ERROR("failed processing images", {
-                            "slot_id", slot.id,
-                            "task_id", slot.task_id,
-                        });
+                        LOG_ERR("%s: failed processing images Slot id : %d, Task id: %d", 
+                            __func__,
+                            slot.id,
+                            slot.task_id
+                        );
                        // FIXME @phymbert: to be properly tested
                        //  early returning without changing the slot state will block the slot for ever
                        // no one at the moment is checking the return value
@@ -1863,10 +1876,10 @@ struct llama_server_context
                        const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
                        const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;

-                        LOG_TEE("\n");
-                        LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
-                        LOG_TEE("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
-                        LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
+                        LOG("\n");
+                        LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
+                        LOG("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
+                        LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);

                        llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
                        llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
@@ -1876,7 +1889,7 @@ struct llama_server_context

                        slot.ga_i += slot.ga_w / slot.ga_n;

-                        LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
+                        LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
                    }
                    slot.n_past_se += n_tokens;
                }
@@ -1901,11 +1914,11 @@ struct llama_server_context
                if (n_batch == 1 || ret < 0)
                {
                    // if you get here, it means the KV cache is full - try increasing it via the context size
-                    LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
+                    LOG("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
                    return false;
                }

-                LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
+                LOG("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);

                // retry with half the batch size to try to find a free slot in the KV cache
                n_batch /= 2;
@@ -2103,6 +2116,9 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
    data["ignore_eos"] = predict->ignoreeos();
    data["embeddings"] = predict->embeddings();

+    // Add the correlationid to json data
+    data["correlation_id"] = predict->correlationid();
+
    // for each image in the request, add the image data
    //
    for (int i = 0; i < predict->images_size(); i++) {
@@ -2341,6 +2357,11 @@ public:
                int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
                reply.set_prompt_tokens(tokens_evaluated);

+                // Log Request Correlation Id
+                LOG_VERBOSE("correlation:", {
+                    { "id", data["correlation_id"] }
+                });
+
                // Send the reply
                writer->Write(reply);

@@ -2364,6 +2385,12 @@ public:
        std::string completion_text;
        task_result result = llama.queue_results.recv(task_id);
        if (!result.error && result.stop) {
+            
+            // Log Request Correlation Id
+            LOG_VERBOSE("correlation:", {
+                { "id", data["correlation_id"] }
+            });
+
            completion_text = result.result_json.value("content", "");
            int32_t tokens_predicted = result.result_json.value("tokens_predicted", 0);
            int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
@@ -2403,6 +2430,31 @@ public:

        return grpc::Status::OK;
    }
+
+    grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
+        llama_client_slot* active_slot = llama.get_active_slot();
+
+        if (active_slot != nullptr) {
+            // Calculate the tokens per second using existing logic
+            double tokens_per_second = 1e3 / active_slot->t_token_generation * active_slot->n_decoded;
+
+            // Populate the response with metrics
+            response->set_slot_id(active_slot->id);
+            response->set_prompt_json_for_slot(active_slot->prompt.dump());
+            response->set_tokens_per_second(tokens_per_second);
+            response->set_tokens_generated(active_slot->n_decoded);
+            response->set_prompt_tokens_processed(active_slot->num_prompt_tokens_processed);
+        } else {
+            // Handle case when no active slot exists
+            response->set_slot_id(0);
+            response->set_prompt_json_for_slot("");
+            response->set_tokens_per_second(0);
+            response->set_tokens_generated(0);
+            response->set_prompt_tokens_processed(0);
+        }
+
+        return grpc::Status::OK;
+    } 
 };

 void RunServer(const std::string& server_address) {
--- a/backend/python/autogptq/requirements-intel.txt
+++ b/backend/python/autogptq/requirements-intel.txt
@@ -2,4 +2,4 @@
 intel-extension-for-pytorch
 torch
 optimum[openvino]
-setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/autogptq/requirements.txt
+++ b/backend/python/autogptq/requirements.txt
@@ -1,6 +1,6 @@
 accelerate
 auto-gptq==0.7.1
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
 transformers
--- a/backend/python/bark/requirements-intel.txt
+++ b/backend/python/bark/requirements-intel.txt
@@ -3,6 +3,6 @@ intel-extension-for-pytorch
 torch
 torchaudio
 optimum[openvino]
-setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 transformers
 accelerate
--- a/backend/python/bark/requirements.txt
+++ b/backend/python/bark/requirements.txt
@@ -1,4 +1,4 @@
 bark==0.1.5
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
--- a/backend/python/common/template/requirements.txt
+++ b/backend/python/common/template/requirements.txt
@@ -1,2 +1,2 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
--- a/backend/python/coqui/requirements-intel.txt
+++ b/backend/python/coqui/requirements-intel.txt
@@ -3,6 +3,6 @@ intel-extension-for-pytorch
 torch
 torchaudio
 optimum[openvino]
-setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 transformers
 accelerate
--- a/backend/python/coqui/requirements.txt
+++ b/backend/python/coqui/requirements.txt
@@ -1,4 +1,4 @@
-TTS==0.22.0
-grpcio==1.66.1
+coqui-tts
+grpcio==1.66.2
 protobuf
 certifi
--- a/backend/python/diffusers/requirements-intel.txt
+++ b/backend/python/diffusers/requirements-intel.txt
@@ -3,7 +3,7 @@ intel-extension-for-pytorch
 torch
 torchvision
 optimum[openvino]
-setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 diffusers
 opencv-python
 transformers
--- a/backend/python/diffusers/requirements.txt
+++ b/backend/python/diffusers/requirements.txt
@@ -1,5 +1,5 @@
 setuptools
-grpcio==1.66.1
+grpcio==1.66.2
 pillow
 protobuf
 certifi
--- a/backend/python/exllama2/requirements.txt
+++ b/backend/python/exllama2/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
 wheel
--- a/backend/python/mamba/requirements.txt
+++ b/backend/python/mamba/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
--- a/backend/python/openvoice/requirements-intel.txt
+++ b/backend/python/openvoice/requirements-intel.txt
@@ -2,7 +2,7 @@
 intel-extension-for-pytorch
 torch
 optimum[openvino]
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 librosa==0.9.1
 faster-whisper==1.0.3
@@ -18,6 +18,6 @@ python-dotenv
 pypinyin==0.50.0
 cn2an==0.5.22
 jieba==0.42.1
-gradio==4.38.1
+gradio==4.44.1
 langid==1.1.6
 git+https://github.com/myshell-ai/MeloTTS.git
--- a/backend/python/openvoice/requirements.txt
+++ b/backend/python/openvoice/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 librosa
 faster-whisper
--- a/backend/python/parler-tts/install.sh
+++ b/backend/python/parler-tts/install.sh
@@ -15,5 +15,12 @@ installRequirements

 # https://github.com/descriptinc/audiotools/issues/101
 # incompatible protobuf versions.
-PYDIR=$(ls ${MY_DIR}/venv/lib)
-curl -L https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o ${MY_DIR}/venv/lib/${PYDIR}/site-packages/google/protobuf/internal/builder.py
+PYDIR=python3.10
+pyenv="${MY_DIR}/venv/lib/${PYDIR}/site-packages/google/protobuf/internal/"
+
+if [ ! -d ${pyenv} ]; then
+    echo "(parler-tts/install.sh): Error: ${pyenv} does not exist"
+    exit 1
+fi
+
+curl -L https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o ${pyenv}/builder.py
--- a/backend/python/parler-tts/requirements-intel.txt
+++ b/backend/python/parler-tts/requirements-intel.txt
@@ -3,6 +3,6 @@ intel-extension-for-pytorch
 torch
 torchaudio
 optimum[openvino]
-setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 transformers
 accelerate
--- a/backend/python/parler-tts/requirements.txt
+++ b/backend/python/parler-tts/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
 llvmlite==0.43.0
--- a/backend/python/rerankers/requirements-intel.txt
+++ b/backend/python/rerankers/requirements-intel.txt
@@ -5,4 +5,4 @@ accelerate
 torch
 rerankers[transformers]
 optimum[openvino]
-setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/rerankers/requirements.txt
+++ b/backend/python/rerankers/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
--- a/backend/python/sentencetransformers/backend.py
+++ b/backend/python/sentencetransformers/backend.py
@@ -55,7 +55,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        """
        model_name = request.Model
        try:
-            self.model = SentenceTransformer(model_name)
+            self.model = SentenceTransformer(model_name, trust_remote_code=request.TrustRemoteCode)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")

--- a/backend/python/sentencetransformers/requirements-cpu.txt
+++ b/backend/python/sentencetransformers/requirements-cpu.txt
@@ -2,5 +2,5 @@ torch
 accelerate
 transformers
 bitsandbytes
-sentence-transformers==3.0.1
+sentence-transformers==3.1.1
 transformers
--- a/backend/python/sentencetransformers/requirements-cublas11.txt
+++ b/backend/python/sentencetransformers/requirements-cublas11.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 torch
 accelerate
-sentence-transformers==3.0.1
+sentence-transformers==3.1.1
 transformers
--- a/backend/python/sentencetransformers/requirements-cublas12.txt
+++ b/backend/python/sentencetransformers/requirements-cublas12.txt
@@ -1,4 +1,4 @@
 torch
 accelerate
-sentence-transformers==3.0.1
+sentence-transformers==3.1.1
 transformers
--- a/backend/python/sentencetransformers/requirements-hipblas.txt
+++ b/backend/python/sentencetransformers/requirements-hipblas.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 torch
 accelerate
-sentence-transformers==3.0.1
+sentence-transformers==3.1.1
 transformers
--- a/backend/python/sentencetransformers/requirements-intel.txt
+++ b/backend/python/sentencetransformers/requirements-intel.txt
@@ -4,5 +4,5 @@ torch
 optimum[openvino]
 setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
 accelerate
-sentence-transformers==3.0.1
+sentence-transformers==3.1.1
 transformers
--- a/backend/python/sentencetransformers/requirements.txt
+++ b/backend/python/sentencetransformers/requirements.txt
@@ -1,3 +1,5 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
-certifi
+certifi
+datasets
+einops
--- a/backend/python/transformers-musicgen/requirements-intel.txt
+++ b/backend/python/transformers-musicgen/requirements-intel.txt
@@ -4,4 +4,4 @@ transformers
 accelerate
 torch
 optimum[openvino]
-setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/transformers-musicgen/requirements.txt
+++ b/backend/python/transformers-musicgen/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 scipy==1.14.0
 certifi
--- a/backend/python/transformers/requirements.txt
+++ b/backend/python/transformers/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
 setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/vall-e-x/requirements-intel.txt
+++ b/backend/python/vall-e-x/requirements-intel.txt
@@ -4,4 +4,4 @@ accelerate
 torch
 torchaudio
 optimum[openvino]
-setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/vall-e-x/requirements.txt
+++ b/backend/python/vall-e-x/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -5,6 +5,8 @@ import argparse
 import signal
 import sys
 import os
+from typing import List
+from PIL import Image

 import backend_pb2
 import backend_pb2_grpc
@@ -15,6 +17,8 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.sampling_params import SamplingParams
 from vllm.utils import random_uuid
 from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.multimodal.utils import fetch_image
+from vllm.assets.video import VideoAsset

 _ONE_DAY_IN_SECONDS = 60 * 60 * 24

@@ -105,6 +109,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        try:
            self.llm = AsyncLLMEngine.from_engine_args(engine_args)
        except Exception as err:
+            print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr)
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")

        try:
@@ -117,7 +122,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
           )
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
-
+        print("Model loaded successfully", file=sys.stderr)
        return backend_pb2.Result(message="Model loaded successfully", success=True)

    async def Predict(self, request, context):
@@ -196,15 +201,33 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        if request.Seed != 0:
            sampling_params.seed = request.Seed

+        # Extract image paths and process images
        prompt = request.Prompt
-        
-        # If tokenizer template is enabled and messages are provided instead of prompt apply the tokenizer template
+
+        image_paths = request.Images
+        image_data = [self.load_image(img_path) for img_path in image_paths]
+
+        videos_path = request.Videos
+        video_data = [self.load_video(video_path) for video_path in videos_path]
+
+        # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template
        if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
            prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)

-        # Generate text
+        # Generate text using the LLM engine
        request_id = random_uuid()
-        outputs = self.llm.generate(prompt, sampling_params, request_id)
+        print(f"Generating text with request_id: {request_id}", file=sys.stderr)
+        outputs = self.llm.generate(
+            {
+                "prompt": prompt,
+                "multi_modal_data": {
+                    "image": image_data if image_data else None,
+                    "video": video_data if video_data else None,
+                } if image_data or video_data else None,
+            },
+            sampling_params=sampling_params,
+            request_id=request_id,
+        )

        # Stream the results
        generated_text = ""
@@ -227,9 +250,49 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        if streaming:
            return

+        # Remove the image files from /tmp folder
+        for img_path in image_paths:
+            try:
+                os.remove(img_path)
+            except Exception as e:
+                print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
+
        # Sending the final generated text
        yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))

+    def load_image(self, image_path: str):
+        """
+        Load an image from the given file path.
+        
+        Args:
+            image_path (str): The path to the image file.
+
+        Returns:
+            Image: The loaded image.
+        """
+        try:
+            return Image.open(image_path)
+        except Exception as e:
+            print(f"Error loading image {image_path}: {e}", file=sys.stderr)
+            return self.load_video(image_path)
+
+    def load_video(self, video_path: str):
+        """
+        Load a video from the given file path.
+        
+        Args:
+            video_path (str): The path to the image file.
+
+        Returns:
+            Video: The loaded video.
+        """
+        try:
+            video = VideoAsset(name=video_path).np_ndarrays
+            return video
+        except Exception as e:
+            print(f"Error loading video {image_path}: {e}", file=sys.stderr)
+            return None
+
 async def serve(address):
    # Start asyncio gRPC server
    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -13,4 +13,18 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi

-installRequirements
+if [ "x${BUILD_TYPE}" == "x" ]; then
+        ensureVenv
+        # https://docs.vllm.ai/en/v0.6.1/getting_started/cpu-installation.html
+        if [ ! -d vllm ]; then
+            git clone https://github.com/vllm-project/vllm
+        fi
+        pushd vllm
+            uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.66.2 protobuf bitsandbytes
+            uv pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+            VLLM_TARGET_DEVICE=cpu python setup.py install
+        popd
+        rm -rf vllm
+    else
+        installRequirements
+fi
--- a/backend/python/vllm/requirements-cublas11.txt
+++ b/backend/python/vllm/requirements-cublas11.txt
@@ -1,4 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 accelerate
 torch
-transformers
+transformers
+bitsandbytes
--- a/backend/python/vllm/requirements-cublas12.txt
+++ b/backend/python/vllm/requirements-cublas12.txt
@@ -1,3 +1,4 @@
 accelerate
 torch
-transformers
+transformers
+bitsandbytes
--- a/backend/python/vllm/requirements-hipblas.txt
+++ b/backend/python/vllm/requirements-hipblas.txt
@@ -1,4 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 accelerate
 torch
-transformers
+transformers
+bitsandbytes
--- a/backend/python/vllm/requirements-intel.txt
+++ b/backend/python/vllm/requirements-intel.txt
@@ -4,4 +4,5 @@ accelerate
 torch
 transformers
 optimum[openvino]
-setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
+bitsandbytes
--- a/backend/python/vllm/requirements.txt
+++ b/backend/python/vllm/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
 setuptools
--- a/core/backend/embeddings.go
+++ b/core/backend/embeddings.go
@@ -10,20 +10,11 @@ import (
 )

 func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) {
-	modelFile := backendConfig.Model
-
-	grpcOpts := gRPCModelOpts(backendConfig)

 	var inferenceModel interface{}
 	var err error

-	opts := modelOpts(backendConfig, appConfig, []model.Option{
-		model.WithLoadGRPCLoadModelOpts(grpcOpts),
-		model.WithThreads(uint32(*backendConfig.Threads)),
-		model.WithAssetDir(appConfig.AssetsDestination),
-		model.WithModel(modelFile),
-		model.WithContext(appConfig.Context),
-	})
+	opts := ModelOptions(backendConfig, appConfig, []model.Option{})

 	if backendConfig.Backend == "" {
 		inferenceModel, err = loader.GreedyLoader(opts...)
--- a/core/backend/image.go
+++ b/core/backend/image.go
@@ -8,19 +8,8 @@ import (
 )

 func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) {
-	threads := backendConfig.Threads
-	if *threads == 0 && appConfig.Threads != 0 {
-		threads = &appConfig.Threads
-	}
-	gRPCOpts := gRPCModelOpts(backendConfig)
-	opts := modelOpts(backendConfig, appConfig, []model.Option{
-		model.WithBackendString(backendConfig.Backend),
-		model.WithAssetDir(appConfig.AssetsDestination),
-		model.WithThreads(uint32(*threads)),
-		model.WithContext(appConfig.Context),
-		model.WithModel(backendConfig.Model),
-		model.WithLoadGRPCLoadModelOpts(gRPCOpts),
-	})
+
+	opts := ModelOptions(backendConfig, appConfig, []model.Option{})

 	inferenceModel, err := loader.BackendLoader(
 		opts...,
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -31,24 +31,13 @@ type TokenUsage struct {
 	Completion int
 }

-func ModelInference(ctx context.Context, s string, messages []schema.Message, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
+func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos, audios []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
 	modelFile := c.Model
-	threads := c.Threads
-	if *threads == 0 && o.Threads != 0 {
-		threads = &o.Threads
-	}
-	grpcOpts := gRPCModelOpts(c)

 	var inferenceModel grpc.Backend
 	var err error

-	opts := modelOpts(c, o, []model.Option{
-		model.WithLoadGRPCLoadModelOpts(grpcOpts),
-		model.WithThreads(uint32(*threads)), // some models uses this to allocate threads during startup
-		model.WithAssetDir(o.AssetsDestination),
-		model.WithModel(modelFile),
-		model.WithContext(o.Context),
-	})
+	opts := ModelOptions(c, o, []model.Option{})

 	if c.Backend != "" {
 		opts = append(opts, model.WithBackendString(c.Backend))
@@ -101,6 +90,8 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 		opts.Messages = protoMessages
 		opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate
 		opts.Images = images
+		opts.Videos = videos
+		opts.Audios = audios

 		tokenUsage := TokenUsage{}

--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -11,32 +11,65 @@ import (
 	"github.com/rs/zerolog/log"
 )

-func modelOpts(c config.BackendConfig, so *config.ApplicationConfig, opts []model.Option) []model.Option {
+func ModelOptions(c config.BackendConfig, so *config.ApplicationConfig, opts []model.Option) []model.Option {
+	name := c.Name
+	if name == "" {
+		name = c.Model
+	}
+
+	defOpts := []model.Option{
+		model.WithBackendString(c.Backend),
+		model.WithModel(c.Model),
+		model.WithAssetDir(so.AssetsDestination),
+		model.WithContext(so.Context),
+		model.WithModelID(name),
+	}
+
+	threads := 1
+
+	if c.Threads != nil {
+		threads = *c.Threads
+	}
+
+	if so.Threads != 0 {
+		threads = so.Threads
+	}
+
+	c.Threads = &threads
+
+	grpcOpts := grpcModelOpts(c)
+	defOpts = append(defOpts, model.WithLoadGRPCLoadModelOpts(grpcOpts))
+
 	if so.SingleBackend {
-		opts = append(opts, model.WithSingleActiveBackend())
+		defOpts = append(defOpts, model.WithSingleActiveBackend())
 	}

 	if so.ParallelBackendRequests {
-		opts = append(opts, model.EnableParallelRequests)
+		defOpts = append(defOpts, model.EnableParallelRequests)
 	}

 	if c.GRPC.Attempts != 0 {
-		opts = append(opts, model.WithGRPCAttempts(c.GRPC.Attempts))
+		defOpts = append(defOpts, model.WithGRPCAttempts(c.GRPC.Attempts))
 	}

 	if c.GRPC.AttemptsSleepTime != 0 {
-		opts = append(opts, model.WithGRPCAttemptsDelay(c.GRPC.AttemptsSleepTime))
+		defOpts = append(defOpts, model.WithGRPCAttemptsDelay(c.GRPC.AttemptsSleepTime))
 	}

 	for k, v := range so.ExternalGRPCBackends {
-		opts = append(opts, model.WithExternalBackend(k, v))
+		defOpts = append(defOpts, model.WithExternalBackend(k, v))
 	}

-	return opts
+	return append(defOpts, opts...)
 }

 func getSeed(c config.BackendConfig) int32 {
-	seed := int32(*c.Seed)
+	var seed int32 = config.RAND_SEED
+
+	if c.Seed != nil {
+		seed = int32(*c.Seed)
+	}
+
 	if seed == config.RAND_SEED {
 		seed = rand.Int31()
 	}
@@ -44,11 +77,47 @@ func getSeed(c config.BackendConfig) int32 {
 	return seed
 }

-func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
+func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 	b := 512
 	if c.Batch != 0 {
 		b = c.Batch
 	}
+
+	f16 := false
+	if c.F16 != nil {
+		f16 = *c.F16
+	}
+
+	embeddings := false
+	if c.Embeddings != nil {
+		embeddings = *c.Embeddings
+	}
+
+	lowVRAM := false
+	if c.LowVRAM != nil {
+		lowVRAM = *c.LowVRAM
+	}
+
+	mmap := false
+	if c.MMap != nil {
+		mmap = *c.MMap
+	}
+
+	ctxSize := 1024
+	if c.ContextSize != nil {
+		ctxSize = *c.ContextSize
+	}
+
+	mmlock := false
+	if c.MMlock != nil {
+		mmlock = *c.MMlock
+	}
+
+	nGPULayers := 9999999
+	if c.NGPULayers != nil {
+		nGPULayers = *c.NGPULayers
+	}
+
 	return &pb.ModelOptions{
 		CUDA:                 c.CUDA || c.Diffusers.CUDA,
 		SchedulerType:        c.Diffusers.SchedulerType,
@@ -56,14 +125,14 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		CFGScale:             c.Diffusers.CFGScale,
 		LoraAdapter:          c.LoraAdapter,
 		LoraScale:            c.LoraScale,
-		F16Memory:            *c.F16,
+		F16Memory:            f16,
 		LoraBase:             c.LoraBase,
 		IMG2IMG:              c.Diffusers.IMG2IMG,
 		CLIPModel:            c.Diffusers.ClipModel,
 		CLIPSubfolder:        c.Diffusers.ClipSubFolder,
 		CLIPSkip:             int32(c.Diffusers.ClipSkip),
 		ControlNet:           c.Diffusers.ControlNet,
-		ContextSize:          int32(*c.ContextSize),
+		ContextSize:          int32(ctxSize),
 		Seed:                 getSeed(c),
 		NBatch:               int32(b),
 		NoMulMatQ:            c.NoMulMatQ,
@@ -85,16 +154,16 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		YarnBetaSlow:         c.YarnBetaSlow,
 		NGQA:                 c.NGQA,
 		RMSNormEps:           c.RMSNormEps,
-		MLock:                *c.MMlock,
+		MLock:                mmlock,
 		RopeFreqBase:         c.RopeFreqBase,
 		RopeScaling:          c.RopeScaling,
 		Type:                 c.ModelType,
 		RopeFreqScale:        c.RopeFreqScale,
 		NUMA:                 c.NUMA,
-		Embeddings:           *c.Embeddings,
-		LowVRAM:              *c.LowVRAM,
-		NGPULayers:           int32(*c.NGPULayers),
-		MMap:                 *c.MMap,
+		Embeddings:           embeddings,
+		LowVRAM:              lowVRAM,
+		NGPULayers:           int32(nGPULayers),
+		MMap:                 mmap,
 		MainGPU:              c.MainGPU,
 		Threads:              int32(*c.Threads),
 		TensorSplit:          c.TensorSplit,
--- a/core/backend/rerank.go
+++ b/core/backend/rerank.go
@@ -9,21 +9,9 @@ import (
 	model "github.com/mudler/LocalAI/pkg/model"
 )

-func Rerank(backend, modelFile string, request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
-	bb := backend
-	if bb == "" {
-		return nil, fmt.Errorf("backend is required")
-	}
+func Rerank(modelFile string, request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {

-	grpcOpts := gRPCModelOpts(backendConfig)
-
-	opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
-		model.WithBackendString(bb),
-		model.WithModel(modelFile),
-		model.WithContext(appConfig.Context),
-		model.WithAssetDir(appConfig.AssetsDestination),
-		model.WithLoadGRPCLoadModelOpts(grpcOpts),
-	})
+	opts := ModelOptions(backendConfig, appConfig, []model.Option{model.WithModel(modelFile)})
 	rerankModel, err := loader.BackendLoader(opts...)
 	if err != nil {
 		return nil, err
--- a/core/backend/soundgeneration.go
+++ b/core/backend/soundgeneration.go
@@ -13,7 +13,6 @@ import (
 )

 func SoundGeneration(
-	backend string,
 	modelFile string,
 	text string,
 	duration *float32,
@@ -25,18 +24,8 @@ func SoundGeneration(
 	appConfig *config.ApplicationConfig,
 	backendConfig config.BackendConfig,
 ) (string, *proto.Result, error) {
-	if backend == "" {
-		return "", nil, fmt.Errorf("backend is a required parameter")
-	}

-	grpcOpts := gRPCModelOpts(backendConfig)
-	opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
-		model.WithBackendString(backend),
-		model.WithModel(modelFile),
-		model.WithContext(appConfig.Context),
-		model.WithAssetDir(appConfig.AssetsDestination),
-		model.WithLoadGRPCLoadModelOpts(grpcOpts),
-	})
+	opts := ModelOptions(backendConfig, appConfig, []model.Option{model.WithModel(modelFile)})

 	soundGenModel, err := loader.BackendLoader(opts...)
 	if err != nil {
--- a/core/backend/token_metrics.go
+++ b/core/backend/token_metrics.go
@@ -0,0 +1,33 @@
+package backend
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/pkg/grpc/proto"
+	model "github.com/mudler/LocalAI/pkg/model"
+)
+
+func TokenMetrics(
+	modelFile string,
+	loader *model.ModelLoader,
+	appConfig *config.ApplicationConfig,
+	backendConfig config.BackendConfig) (*proto.MetricsResponse, error) {
+
+	opts := ModelOptions(backendConfig, appConfig, []model.Option{
+		model.WithModel(modelFile),
+	})
+	model, err := loader.BackendLoader(opts...)
+	if err != nil {
+		return nil, err
+	}
+
+	if model == nil {
+		return nil, fmt.Errorf("could not loadmodel model")
+	}
+
+	res, err := model.GetTokenMetrics(context.Background(), &proto.MetricsRequest{})
+
+	return res, err
+}
--- a/core/backend/tokenize.go
+++ b/core/backend/tokenize.go
@@ -0,0 +1,44 @@
+package backend
+
+import (
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/pkg/grpc"
+	model "github.com/mudler/LocalAI/pkg/model"
+)
+
+func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (schema.TokenizeResponse, error) {
+
+	modelFile := backendConfig.Model
+
+	var inferenceModel grpc.Backend
+	var err error
+
+	opts := ModelOptions(backendConfig, appConfig, []model.Option{
+		model.WithModel(modelFile),
+	})
+
+	if backendConfig.Backend == "" {
+		inferenceModel, err = loader.GreedyLoader(opts...)
+	} else {
+		opts = append(opts, model.WithBackendString(backendConfig.Backend))
+		inferenceModel, err = loader.BackendLoader(opts...)
+	}
+	if err != nil {
+		return schema.TokenizeResponse{}, err
+	}
+
+	predictOptions := gRPCPredictOpts(backendConfig, loader.ModelPath)
+	predictOptions.Prompt = s
+
+	// tokenize the string
+	resp, err := inferenceModel.TokenizeString(appConfig.Context, predictOptions)
+	if err != nil {
+		return schema.TokenizeResponse{}, err
+	}
+
+	return schema.TokenizeResponse{
+		Tokens: resp.Tokens,
+	}, nil
+
+}
--- a/core/backend/transcript.go
+++ b/core/backend/transcript.go
@@ -14,13 +14,11 @@ import (

 func ModelTranscription(audio, language string, translate bool, ml *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (*schema.TranscriptionResult, error) {

-	opts := modelOpts(backendConfig, appConfig, []model.Option{
-		model.WithBackendString(model.WhisperBackend),
-		model.WithModel(backendConfig.Model),
-		model.WithContext(appConfig.Context),
-		model.WithThreads(uint32(*backendConfig.Threads)),
-		model.WithAssetDir(appConfig.AssetsDestination),
-	})
+	if backendConfig.Backend == "" {
+		backendConfig.Backend = model.WhisperBackend
+	}
+
+	opts := ModelOptions(backendConfig, appConfig, []model.Option{})

 	transcriptionModel, err := ml.BackendLoader(opts...)
 	if err != nil {
--- a/core/backend/tts.go
+++ b/core/backend/tts.go
@@ -28,14 +28,9 @@ func ModelTTS(
 		bb = model.PiperBackend
 	}

-	grpcOpts := gRPCModelOpts(backendConfig)
-
-	opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
+	opts := ModelOptions(config.BackendConfig{}, appConfig, []model.Option{
 		model.WithBackendString(bb),
 		model.WithModel(modelFile),
-		model.WithContext(appConfig.Context),
-		model.WithAssetDir(appConfig.AssetsDestination),
-		model.WithLoadGRPCLoadModelOpts(grpcOpts),
 	})
 	ttsModel, err := loader.BackendLoader(opts...)
 	if err != nil {
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -41,31 +41,35 @@ type RunCMD struct {
 	Threads     int  `env:"LOCALAI_THREADS,THREADS" short:"t" help:"Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested" group:"performance"`
 	ContextSize int  `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" default:"512" help:"Default context size for models" group:"performance"`

-	Address                string   `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
-	CORS                   bool     `env:"LOCALAI_CORS,CORS" help:"" group:"api"`
-	CORSAllowOrigins       string   `env:"LOCALAI_CORS_ALLOW_ORIGINS,CORS_ALLOW_ORIGINS" group:"api"`
-	LibraryPath            string   `env:"LOCALAI_LIBRARY_PATH,LIBRARY_PATH" help:"Path to the library directory (for e.g. external libraries used by backends)" default:"/usr/share/local-ai/libs" group:"backends"`
-	CSRF                   bool     `env:"LOCALAI_CSRF" help:"Enables fiber CSRF middleware" group:"api"`
-	UploadLimit            int      `env:"LOCALAI_UPLOAD_LIMIT,UPLOAD_LIMIT" default:"15" help:"Default upload-limit in MB" group:"api"`
-	APIKeys                []string `env:"LOCALAI_API_KEY,API_KEY" help:"List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys" group:"api"`
-	DisableWebUI           bool     `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disable webui" group:"api"`
-	DisablePredownloadScan bool     `env:"LOCALAI_DISABLE_PREDOWNLOAD_SCAN" help:"If true, disables the best-effort security scanner before downloading any files." group:"hardening" default:"false"`
-	OpaqueErrors           bool     `env:"LOCALAI_OPAQUE_ERRORS" default:"false" help:"If true, all error responses are replaced with blank 500 errors. This is intended only for hardening against information leaks and is normally not recommended." group:"hardening"`
-	Peer2Peer              bool     `env:"LOCALAI_P2P,P2P" name:"p2p" default:"false" help:"Enable P2P mode" group:"p2p"`
-	Peer2PeerDHTInterval   int      `env:"LOCALAI_P2P_DHT_INTERVAL,P2P_DHT_INTERVAL" default:"360" name:"p2p-dht-interval" help:"Interval for DHT refresh (used during token generation)" group:"p2p"`
-	Peer2PeerOTPInterval   int      `env:"LOCALAI_P2P_OTP_INTERVAL,P2P_OTP_INTERVAL" default:"9000" name:"p2p-otp-interval" help:"Interval for OTP refresh (used during token generation)" group:"p2p"`
-	Peer2PeerToken         string   `env:"LOCALAI_P2P_TOKEN,P2P_TOKEN,TOKEN" name:"p2ptoken" help:"Token for P2P mode (optional)" group:"p2p"`
-	Peer2PeerNetworkID     string   `env:"LOCALAI_P2P_NETWORK_ID,P2P_NETWORK_ID" help:"Network ID for P2P mode, can be set arbitrarly by the user for grouping a set of instances" group:"p2p"`
-	ParallelRequests       bool     `env:"LOCALAI_PARALLEL_REQUESTS,PARALLEL_REQUESTS" help:"Enable backends to handle multiple requests in parallel if they support it (e.g.: llama.cpp or vllm)" group:"backends"`
-	SingleActiveBackend    bool     `env:"LOCALAI_SINGLE_ACTIVE_BACKEND,SINGLE_ACTIVE_BACKEND" help:"Allow only one backend to be run at a time" group:"backends"`
-	PreloadBackendOnly     bool     `env:"LOCALAI_PRELOAD_BACKEND_ONLY,PRELOAD_BACKEND_ONLY" default:"false" help:"Do not launch the API services, only the preloaded models / backends are started (useful for multi-node setups)" group:"backends"`
-	ExternalGRPCBackends   []string `env:"LOCALAI_EXTERNAL_GRPC_BACKENDS,EXTERNAL_GRPC_BACKENDS" help:"A list of external grpc backends" group:"backends"`
-	EnableWatchdogIdle     bool     `env:"LOCALAI_WATCHDOG_IDLE,WATCHDOG_IDLE" default:"false" help:"Enable watchdog for stopping backends that are idle longer than the watchdog-idle-timeout" group:"backends"`
-	WatchdogIdleTimeout    string   `env:"LOCALAI_WATCHDOG_IDLE_TIMEOUT,WATCHDOG_IDLE_TIMEOUT" default:"15m" help:"Threshold beyond which an idle backend should be stopped" group:"backends"`
-	EnableWatchdogBusy     bool     `env:"LOCALAI_WATCHDOG_BUSY,WATCHDOG_BUSY" default:"false" help:"Enable watchdog for stopping backends that are busy longer than the watchdog-busy-timeout" group:"backends"`
-	WatchdogBusyTimeout    string   `env:"LOCALAI_WATCHDOG_BUSY_TIMEOUT,WATCHDOG_BUSY_TIMEOUT" default:"5m" help:"Threshold beyond which a busy backend should be stopped" group:"backends"`
-	Federated              bool     `env:"LOCALAI_FEDERATED,FEDERATED" help:"Enable federated instance" group:"federated"`
-	DisableGalleryEndpoint bool     `env:"LOCALAI_DISABLE_GALLERY_ENDPOINT,DISABLE_GALLERY_ENDPOINT" help:"Disable the gallery endpoints" group:"api"`
+	Address                            string   `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
+	CORS                               bool     `env:"LOCALAI_CORS,CORS" help:"" group:"api"`
+	CORSAllowOrigins                   string   `env:"LOCALAI_CORS_ALLOW_ORIGINS,CORS_ALLOW_ORIGINS" group:"api"`
+	LibraryPath                        string   `env:"LOCALAI_LIBRARY_PATH,LIBRARY_PATH" help:"Path to the library directory (for e.g. external libraries used by backends)" default:"/usr/share/local-ai/libs" group:"backends"`
+	CSRF                               bool     `env:"LOCALAI_CSRF" help:"Enables fiber CSRF middleware" group:"api"`
+	UploadLimit                        int      `env:"LOCALAI_UPLOAD_LIMIT,UPLOAD_LIMIT" default:"15" help:"Default upload-limit in MB" group:"api"`
+	APIKeys                            []string `env:"LOCALAI_API_KEY,API_KEY" help:"List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys" group:"api"`
+	DisableWebUI                       bool     `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disable webui" group:"api"`
+	DisablePredownloadScan             bool     `env:"LOCALAI_DISABLE_PREDOWNLOAD_SCAN" help:"If true, disables the best-effort security scanner before downloading any files." group:"hardening" default:"false"`
+	OpaqueErrors                       bool     `env:"LOCALAI_OPAQUE_ERRORS" default:"false" help:"If true, all error responses are replaced with blank 500 errors. This is intended only for hardening against information leaks and is normally not recommended." group:"hardening"`
+	UseSubtleKeyComparison             bool     `env:"LOCALAI_SUBTLE_KEY_COMPARISON" default:"false" help:"If true, API Key validation comparisons will be performed using constant-time comparisons rather than simple equality. This trades off performance on each request for resiliancy against timing attacks." group:"hardening"`
+	DisableApiKeyRequirementForHttpGet bool     `env:"LOCALAI_DISABLE_API_KEY_REQUIREMENT_FOR_HTTP_GET" default:"false" help:"If true, a valid API key is not required to issue GET requests to portions of the web ui. This should only be enabled in secure testing environments" group:"hardening"`
+	HttpGetExemptedEndpoints           []string `env:"LOCALAI_HTTP_GET_EXEMPTED_ENDPOINTS" default:"^/$,^/browse/?$,^/talk/?$,^/p2p/?$,^/chat/?$,^/text2image/?$,^/tts/?$,^/static/.*$,^/swagger.*$" help:"If LOCALAI_DISABLE_API_KEY_REQUIREMENT_FOR_HTTP_GET is overriden to true, this is the list of endpoints to exempt. Only adjust this in case of a security incident or as a result of a personal security posture review" group:"hardening"`
+	Peer2Peer                          bool     `env:"LOCALAI_P2P,P2P" name:"p2p" default:"false" help:"Enable P2P mode" group:"p2p"`
+	Peer2PeerDHTInterval               int      `env:"LOCALAI_P2P_DHT_INTERVAL,P2P_DHT_INTERVAL" default:"360" name:"p2p-dht-interval" help:"Interval for DHT refresh (used during token generation)" group:"p2p"`
+	Peer2PeerOTPInterval               int      `env:"LOCALAI_P2P_OTP_INTERVAL,P2P_OTP_INTERVAL" default:"9000" name:"p2p-otp-interval" help:"Interval for OTP refresh (used during token generation)" group:"p2p"`
+	Peer2PeerToken                     string   `env:"LOCALAI_P2P_TOKEN,P2P_TOKEN,TOKEN" name:"p2ptoken" help:"Token for P2P mode (optional)" group:"p2p"`
+	Peer2PeerNetworkID                 string   `env:"LOCALAI_P2P_NETWORK_ID,P2P_NETWORK_ID" help:"Network ID for P2P mode, can be set arbitrarly by the user for grouping a set of instances" group:"p2p"`
+	ParallelRequests                   bool     `env:"LOCALAI_PARALLEL_REQUESTS,PARALLEL_REQUESTS" help:"Enable backends to handle multiple requests in parallel if they support it (e.g.: llama.cpp or vllm)" group:"backends"`
+	SingleActiveBackend                bool     `env:"LOCALAI_SINGLE_ACTIVE_BACKEND,SINGLE_ACTIVE_BACKEND" help:"Allow only one backend to be run at a time" group:"backends"`
+	PreloadBackendOnly                 bool     `env:"LOCALAI_PRELOAD_BACKEND_ONLY,PRELOAD_BACKEND_ONLY" default:"false" help:"Do not launch the API services, only the preloaded models / backends are started (useful for multi-node setups)" group:"backends"`
+	ExternalGRPCBackends               []string `env:"LOCALAI_EXTERNAL_GRPC_BACKENDS,EXTERNAL_GRPC_BACKENDS" help:"A list of external grpc backends" group:"backends"`
+	EnableWatchdogIdle                 bool     `env:"LOCALAI_WATCHDOG_IDLE,WATCHDOG_IDLE" default:"false" help:"Enable watchdog for stopping backends that are idle longer than the watchdog-idle-timeout" group:"backends"`
+	WatchdogIdleTimeout                string   `env:"LOCALAI_WATCHDOG_IDLE_TIMEOUT,WATCHDOG_IDLE_TIMEOUT" default:"15m" help:"Threshold beyond which an idle backend should be stopped" group:"backends"`
+	EnableWatchdogBusy                 bool     `env:"LOCALAI_WATCHDOG_BUSY,WATCHDOG_BUSY" default:"false" help:"Enable watchdog for stopping backends that are busy longer than the watchdog-busy-timeout" group:"backends"`
+	WatchdogBusyTimeout                string   `env:"LOCALAI_WATCHDOG_BUSY_TIMEOUT,WATCHDOG_BUSY_TIMEOUT" default:"5m" help:"Threshold beyond which a busy backend should be stopped" group:"backends"`
+	Federated                          bool     `env:"LOCALAI_FEDERATED,FEDERATED" help:"Enable federated instance" group:"federated"`
+	DisableGalleryEndpoint             bool     `env:"LOCALAI_DISABLE_GALLERY_ENDPOINT,DISABLE_GALLERY_ENDPOINT" help:"Disable the gallery endpoints" group:"api"`
+	LoadToMemory                       []string `env:"LOCALAI_LOAD_TO_MEMORY,LOAD_TO_MEMORY" help:"A list of models to load into memory at startup" group:"models"`
 }

 func (r *RunCMD) Run(ctx *cliContext.Context) error {
@@ -97,7 +101,11 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 		config.WithModelsURL(append(r.Models, r.ModelArgs...)...),
 		config.WithOpaqueErrors(r.OpaqueErrors),
 		config.WithEnforcedPredownloadScans(!r.DisablePredownloadScan),
+		config.WithSubtleKeyComparison(r.UseSubtleKeyComparison),
+		config.WithDisableApiKeyRequirementForHttpGet(r.DisableApiKeyRequirementForHttpGet),
+		config.WithHttpGetExemptedEndpoints(r.HttpGetExemptedEndpoints),
 		config.WithP2PNetworkID(r.Peer2PeerNetworkID),
+		config.WithLoadToMemory(r.LoadToMemory),
 	}

 	token := ""
--- a/core/cli/soundgeneration.go
+++ b/core/cli/soundgeneration.go
@@ -85,13 +85,14 @@ func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {

 	options := config.BackendConfig{}
 	options.SetDefaults()
+	options.Backend = t.Backend

 	var inputFile *string
 	if t.InputFile != "" {
 		inputFile = &t.InputFile
 	}

-	filePath, _, err := backend.SoundGeneration(t.Backend, t.Model, text,
+	filePath, _, err := backend.SoundGeneration(t.Model, text,
 		parseToFloat32Ptr(t.Duration), parseToFloat32Ptr(t.Temperature), &t.DoSample,
 		inputFile, parseToInt32Ptr(t.InputFileSampleDivisor), ml, opts, options)

--- a/core/cli/util.go
+++ b/core/cli/util.go
@@ -15,8 +15,9 @@ import (
 )

 type UtilCMD struct {
-	GGUFInfo GGUFInfoCMD `cmd:"" name:"gguf-info" help:"Get information about a GGUF file"`
-	HFScan   HFScanCMD   `cmd:"" name:"hf-scan" help:"Checks installed models for known security issues. WARNING: this is a best-effort feature and may not catch everything!"`
+	GGUFInfo         GGUFInfoCMD         `cmd:"" name:"gguf-info" help:"Get information about a GGUF file"`
+	HFScan           HFScanCMD           `cmd:"" name:"hf-scan" help:"Checks installed models for known security issues. WARNING: this is a best-effort feature and may not catch everything!"`
+	UsecaseHeuristic UsecaseHeuristicCMD `cmd:"" name:"usecase-heuristic" help:"Checks a specific model config and prints what usecase LocalAI will offer for it."`
 }

 type GGUFInfoCMD struct {
@@ -30,6 +31,11 @@ type HFScanCMD struct {
 	ToScan     []string `arg:""`
 }

+type UsecaseHeuristicCMD struct {
+	ConfigName string `name:"The config file to check"`
+	ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
+}
+
 func (u *GGUFInfoCMD) Run(ctx *cliContext.Context) error {
 	if u.Args == nil || len(u.Args) == 0 {
 		return fmt.Errorf("no GGUF file provided")
@@ -99,3 +105,31 @@ func (hfscmd *HFScanCMD) Run(ctx *cliContext.Context) error {
 		return nil
 	}
 }
+
+func (uhcmd *UsecaseHeuristicCMD) Run(ctx *cliContext.Context) error {
+	if len(uhcmd.ConfigName) == 0 {
+		log.Error().Msg("ConfigName is a required parameter")
+		return fmt.Errorf("config name is a required parameter")
+	}
+	if len(uhcmd.ModelsPath) == 0 {
+		log.Error().Msg("ModelsPath is a required parameter")
+		return fmt.Errorf("model path is a required parameter")
+	}
+	bcl := config.NewBackendConfigLoader(uhcmd.ModelsPath)
+	err := bcl.LoadBackendConfig(uhcmd.ConfigName)
+	if err != nil {
+		log.Error().Err(err).Str("ConfigName", uhcmd.ConfigName).Msg("error while loading backend")
+		return err
+	}
+	bc, exists := bcl.GetBackendConfig(uhcmd.ConfigName)
+	if !exists {
+		log.Error().Str("ConfigName", uhcmd.ConfigName).Msg("ConfigName not found")
+	}
+	for name, uc := range config.GetAllBackendConfigUsecases() {
+		if bc.HasUsecases(uc) {
+			log.Info().Str("Usecase", name)
+		}
+	}
+	log.Info().Msg("---")
+	return nil
+}
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"embed"
 	"encoding/json"
+	"regexp"
 	"time"

 	"github.com/mudler/LocalAI/pkg/xsysinfo"
@@ -16,7 +17,6 @@ type ApplicationConfig struct {
 	ModelPath                           string
 	LibPath                             string
 	UploadLimitMB, Threads, ContextSize int
-	DisableWebUI                        bool
 	F16                                 bool
 	Debug                               bool
 	ImageDir                            string
@@ -31,11 +31,18 @@ type ApplicationConfig struct {
 	PreloadModelsFromPath               string
 	CORSAllowOrigins                    string
 	ApiKeys                             []string
-	EnforcePredownloadScans             bool
-	OpaqueErrors                        bool
 	P2PToken                            string
 	P2PNetworkID                        string

+	DisableWebUI                       bool
+	EnforcePredownloadScans            bool
+	OpaqueErrors                       bool
+	UseSubtleKeyComparison             bool
+	DisableApiKeyRequirementForHttpGet bool
+	HttpGetExemptedEndpoints           []*regexp.Regexp
+	DisableGalleryEndpoint             bool
+	LoadToMemory                       []string
+
 	ModelLibraryURL string

 	Galleries []Gallery
@@ -57,8 +64,6 @@ type ApplicationConfig struct {
 	ModelsURL []string

 	WatchDogBusyTimeout, WatchDogIdleTimeout time.Duration
-
-	DisableGalleryEndpoint bool
 }

 type AppOption func(*ApplicationConfig)
@@ -327,6 +332,38 @@ func WithOpaqueErrors(opaque bool) AppOption {
 	}
 }

+func WithLoadToMemory(models []string) AppOption {
+	return func(o *ApplicationConfig) {
+		o.LoadToMemory = models
+	}
+}
+
+func WithSubtleKeyComparison(subtle bool) AppOption {
+	return func(o *ApplicationConfig) {
+		o.UseSubtleKeyComparison = subtle
+	}
+}
+
+func WithDisableApiKeyRequirementForHttpGet(required bool) AppOption {
+	return func(o *ApplicationConfig) {
+		o.DisableApiKeyRequirementForHttpGet = required
+	}
+}
+
+func WithHttpGetExemptedEndpoints(endpoints []string) AppOption {
+	return func(o *ApplicationConfig) {
+		o.HttpGetExemptedEndpoints = []*regexp.Regexp{}
+		for _, epr := range endpoints {
+			r, err := regexp.Compile(epr)
+			if err == nil && r != nil {
+				o.HttpGetExemptedEndpoints = append(o.HttpGetExemptedEndpoints, r)
+			} else {
+				log.Warn().Err(err).Str("regex", epr).Msg("Error while compiling HTTP Get Exemption regex, skipping this entry.")
+			}
+		}
+	}
+}
+
 // ToConfigLoaderOptions returns a slice of ConfigLoader Option.
 // Some options defined at the application level are going to be passed as defaults for
 // all the configuration for the models.
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -3,11 +3,13 @@ package config
 import (
 	"os"
 	"regexp"
+	"slices"
 	"strings"

 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/downloader"
 	"github.com/mudler/LocalAI/pkg/functions"
+	"gopkg.in/yaml.v3"
 )

 const (
@@ -27,13 +29,15 @@ type BackendConfig struct {
 	schema.PredictionOptions `yaml:"parameters"`
 	Name                     string `yaml:"name"`

-	F16            *bool             `yaml:"f16"`
-	Threads        *int              `yaml:"threads"`
-	Debug          *bool             `yaml:"debug"`
-	Roles          map[string]string `yaml:"roles"`
-	Embeddings     *bool             `yaml:"embeddings"`
-	Backend        string            `yaml:"backend"`
-	TemplateConfig TemplateConfig    `yaml:"template"`
+	F16                 *bool                  `yaml:"f16"`
+	Threads             *int                   `yaml:"threads"`
+	Debug               *bool                  `yaml:"debug"`
+	Roles               map[string]string      `yaml:"roles"`
+	Embeddings          *bool                  `yaml:"embeddings"`
+	Backend             string                 `yaml:"backend"`
+	TemplateConfig      TemplateConfig         `yaml:"template"`
+	KnownUsecaseStrings []string               `yaml:"known_usecases"`
+	KnownUsecases       *BackendConfigUsecases `yaml:"-"`

 	PromptStrings, InputStrings                []string               `yaml:"-"`
 	InputToken                                 [][]int                `yaml:"-"`
@@ -192,6 +196,21 @@ type TemplateConfig struct {
 	// JoinChatMessagesByCharacter is a string that will be used to join chat messages together.
 	// It defaults to \n
 	JoinChatMessagesByCharacter *string `yaml:"join_chat_messages_by_character"`
+
+	Video string `yaml:"video"`
+	Image string `yaml:"image"`
+	Audio string `yaml:"audio"`
+}
+
+func (c *BackendConfig) UnmarshalYAML(value *yaml.Node) error {
+	type BCAlias BackendConfig
+	var aux BCAlias
+	if err := value.Decode(&aux); err != nil {
+		return err
+	}
+	*c = BackendConfig(aux)
+	c.KnownUsecases = GetUsecasesFromYAML(c.KnownUsecaseStrings)
+	return nil
 }

 func (c *BackendConfig) SetFunctionCallString(s string) {
@@ -410,3 +429,121 @@ func (c *BackendConfig) Validate() bool {
 func (c *BackendConfig) HasTemplate() bool {
 	return c.TemplateConfig.Completion != "" || c.TemplateConfig.Edit != "" || c.TemplateConfig.Chat != "" || c.TemplateConfig.ChatMessage != ""
 }
+
+type BackendConfigUsecases int
+
+const (
+	FLAG_ANY              BackendConfigUsecases = 0b000000000
+	FLAG_CHAT             BackendConfigUsecases = 0b000000001
+	FLAG_COMPLETION       BackendConfigUsecases = 0b000000010
+	FLAG_EDIT             BackendConfigUsecases = 0b000000100
+	FLAG_EMBEDDINGS       BackendConfigUsecases = 0b000001000
+	FLAG_RERANK           BackendConfigUsecases = 0b000010000
+	FLAG_IMAGE            BackendConfigUsecases = 0b000100000
+	FLAG_TRANSCRIPT       BackendConfigUsecases = 0b001000000
+	FLAG_TTS              BackendConfigUsecases = 0b010000000
+	FLAG_SOUND_GENERATION BackendConfigUsecases = 0b100000000
+
+	// Common Subsets
+	FLAG_LLM BackendConfigUsecases = FLAG_CHAT & FLAG_COMPLETION & FLAG_EDIT
+)
+
+func GetAllBackendConfigUsecases() map[string]BackendConfigUsecases {
+	return map[string]BackendConfigUsecases{
+		"FLAG_ANY":              FLAG_ANY,
+		"FLAG_CHAT":             FLAG_CHAT,
+		"FLAG_COMPLETION":       FLAG_COMPLETION,
+		"FLAG_EDIT":             FLAG_EDIT,
+		"FLAG_EMBEDDINGS":       FLAG_EMBEDDINGS,
+		"FLAG_RERANK":           FLAG_RERANK,
+		"FLAG_IMAGE":            FLAG_IMAGE,
+		"FLAG_TRANSCRIPT":       FLAG_TRANSCRIPT,
+		"FLAG_TTS":              FLAG_TTS,
+		"FLAG_SOUND_GENERATION": FLAG_SOUND_GENERATION,
+		"FLAG_LLM":              FLAG_LLM,
+	}
+}
+
+func GetUsecasesFromYAML(input []string) *BackendConfigUsecases {
+	if len(input) == 0 {
+		return nil
+	}
+	result := FLAG_ANY
+	flags := GetAllBackendConfigUsecases()
+	for _, str := range input {
+		flag, exists := flags["FLAG_"+strings.ToUpper(str)]
+		if exists {
+			result |= flag
+		}
+	}
+	return &result
+}
+
+// HasUsecases examines a BackendConfig and determines which endpoints have a chance of success.
+func (c *BackendConfig) HasUsecases(u BackendConfigUsecases) bool {
+	if (c.KnownUsecases != nil) && ((u & *c.KnownUsecases) == u) {
+		return true
+	}
+	return c.GuessUsecases(u)
+}
+
+// GuessUsecases is a **heuristic based** function, as the backend in question may not be loaded yet, and the config may not record what it's useful at.
+// In its current state, this function should ideally check for properties of the config like templates, rather than the direct backend name checks for the lower half.
+// This avoids the maintenance burden of updating this list for each new backend - but unfortunately, that's the best option for some services currently.
+func (c *BackendConfig) GuessUsecases(u BackendConfigUsecases) bool {
+	if (u & FLAG_CHAT) == FLAG_CHAT {
+		if c.TemplateConfig.Chat == "" && c.TemplateConfig.ChatMessage == "" {
+			return false
+		}
+	}
+	if (u & FLAG_COMPLETION) == FLAG_COMPLETION {
+		if c.TemplateConfig.Completion == "" {
+			return false
+		}
+	}
+	if (u & FLAG_EDIT) == FLAG_EDIT {
+		if c.TemplateConfig.Edit == "" {
+			return false
+		}
+	}
+	if (u & FLAG_EMBEDDINGS) == FLAG_EMBEDDINGS {
+		if c.Embeddings == nil || !*c.Embeddings {
+			return false
+		}
+	}
+	if (u & FLAG_IMAGE) == FLAG_IMAGE {
+		imageBackends := []string{"diffusers", "tinydream", "stablediffusion"}
+		if !slices.Contains(imageBackends, c.Backend) {
+			return false
+		}
+
+		if c.Backend == "diffusers" && c.Diffusers.PipelineType == "" {
+			return false
+		}
+
+	}
+	if (u & FLAG_RERANK) == FLAG_RERANK {
+		if c.Backend != "rerankers" {
+			return false
+		}
+	}
+	if (u & FLAG_TRANSCRIPT) == FLAG_TRANSCRIPT {
+		if c.Backend != "whisper" {
+			return false
+		}
+	}
+	if (u & FLAG_TTS) == FLAG_TTS {
+		ttsBackends := []string{"piper", "transformers-musicgen", "parler-tts"}
+		if !slices.Contains(ttsBackends, c.Backend) {
+			return false
+		}
+	}
+
+	if (u & FLAG_SOUND_GENERATION) == FLAG_SOUND_GENERATION {
+		if c.Backend != "transformers-musicgen" {
+			return false
+		}
+	}
+
+	return true
+}
--- a/core/config/backend_config_filter.go
+++ b/core/config/backend_config_filter.go
@@ -0,0 +1,35 @@
+package config
+
+import "regexp"
+
+type BackendConfigFilterFn func(string, *BackendConfig) bool
+
+func NoFilterFn(_ string, _ *BackendConfig) bool { return true }
+
+func BuildNameFilterFn(filter string) (BackendConfigFilterFn, error) {
+	if filter == "" {
+		return NoFilterFn, nil
+	}
+	rxp, err := regexp.Compile(filter)
+	if err != nil {
+		return nil, err
+	}
+	return func(name string, config *BackendConfig) bool {
+		if config != nil {
+			return rxp.MatchString(config.Name)
+		}
+		return rxp.MatchString(name)
+	}, nil
+}
+
+func BuildUsecaseFilterFn(usecases BackendConfigUsecases) BackendConfigFilterFn {
+	if usecases == FLAG_ANY {
+		return NoFilterFn
+	}
+	return func(name string, config *BackendConfig) bool {
+		if config == nil {
+			return false // TODO: Potentially make this a param, for now, no known usecase to include
+		}
+		return config.HasUsecases(usecases)
+	}
+}
--- a/core/config/backend_config_loader.go
+++ b/core/config/backend_config_loader.go
@@ -201,6 +201,26 @@ func (bcl *BackendConfigLoader) GetAllBackendConfigs() []BackendConfig {
 	return res
 }

+func (bcl *BackendConfigLoader) GetBackendConfigsByFilter(filter BackendConfigFilterFn) []BackendConfig {
+	bcl.Lock()
+	defer bcl.Unlock()
+	var res []BackendConfig
+
+	if filter == nil {
+		filter = NoFilterFn
+	}
+
+	for n, v := range bcl.configs {
+		if filter(n, &v) {
+			res = append(res, v)
+		}
+	}
+
+	// TODO: I don't think this one needs to Sort on name... but we'll see what breaks.
+
+	return res
+}
+
 func (bcl *BackendConfigLoader) RemoveBackendConfig(m string) {
 	bcl.Lock()
 	defer bcl.Unlock()
--- a/core/config/backend_config_test.go
+++ b/core/config/backend_config_test.go
@@ -19,12 +19,17 @@ var _ = Describe("Test cases for config related functions", func() {
 				`backend: "../foo-bar"
 name: "foo"
 parameters:
-  model: "foo-bar"`)
+  model: "foo-bar"
+known_usecases:
+- chat
+- COMPLETION
+`)
 			Expect(err).ToNot(HaveOccurred())
 			config, err := readBackendConfigFromFile(tmp.Name())
 			Expect(err).To(BeNil())
 			Expect(config).ToNot(BeNil())
 			Expect(config.Validate()).To(BeFalse())
+			Expect(config.KnownUsecases).ToNot(BeNil())
 		})
 		It("Test Validate", func() {
 			tmp, err := os.CreateTemp("", "config.yaml")
@@ -61,4 +66,99 @@ parameters:
 			Expect(config.Validate()).To(BeTrue())
 		})
 	})
+	It("Properly handles backend usecase matching", func() {
+
+		a := BackendConfig{
+			Name: "a",
+		}
+		Expect(a.HasUsecases(FLAG_ANY)).To(BeTrue()) // FLAG_ANY just means the config _exists_ essentially.
+
+		b := BackendConfig{
+			Name:    "b",
+			Backend: "stablediffusion",
+		}
+		Expect(b.HasUsecases(FLAG_ANY)).To(BeTrue())
+		Expect(b.HasUsecases(FLAG_IMAGE)).To(BeTrue())
+		Expect(b.HasUsecases(FLAG_CHAT)).To(BeFalse())
+
+		c := BackendConfig{
+			Name:    "c",
+			Backend: "llama-cpp",
+			TemplateConfig: TemplateConfig{
+				Chat: "chat",
+			},
+		}
+		Expect(c.HasUsecases(FLAG_ANY)).To(BeTrue())
+		Expect(c.HasUsecases(FLAG_IMAGE)).To(BeFalse())
+		Expect(c.HasUsecases(FLAG_COMPLETION)).To(BeFalse())
+		Expect(c.HasUsecases(FLAG_CHAT)).To(BeTrue())
+
+		d := BackendConfig{
+			Name:    "d",
+			Backend: "llama-cpp",
+			TemplateConfig: TemplateConfig{
+				Chat:       "chat",
+				Completion: "completion",
+			},
+		}
+		Expect(d.HasUsecases(FLAG_ANY)).To(BeTrue())
+		Expect(d.HasUsecases(FLAG_IMAGE)).To(BeFalse())
+		Expect(d.HasUsecases(FLAG_COMPLETION)).To(BeTrue())
+		Expect(d.HasUsecases(FLAG_CHAT)).To(BeTrue())
+
+		trueValue := true
+		e := BackendConfig{
+			Name:    "e",
+			Backend: "llama-cpp",
+			TemplateConfig: TemplateConfig{
+				Completion: "completion",
+			},
+			Embeddings: &trueValue,
+		}
+
+		Expect(e.HasUsecases(FLAG_ANY)).To(BeTrue())
+		Expect(e.HasUsecases(FLAG_IMAGE)).To(BeFalse())
+		Expect(e.HasUsecases(FLAG_COMPLETION)).To(BeTrue())
+		Expect(e.HasUsecases(FLAG_CHAT)).To(BeFalse())
+		Expect(e.HasUsecases(FLAG_EMBEDDINGS)).To(BeTrue())
+
+		f := BackendConfig{
+			Name:    "f",
+			Backend: "piper",
+		}
+		Expect(f.HasUsecases(FLAG_ANY)).To(BeTrue())
+		Expect(f.HasUsecases(FLAG_TTS)).To(BeTrue())
+		Expect(f.HasUsecases(FLAG_CHAT)).To(BeFalse())
+
+		g := BackendConfig{
+			Name:    "g",
+			Backend: "whisper",
+		}
+		Expect(g.HasUsecases(FLAG_ANY)).To(BeTrue())
+		Expect(g.HasUsecases(FLAG_TRANSCRIPT)).To(BeTrue())
+		Expect(g.HasUsecases(FLAG_TTS)).To(BeFalse())
+
+		h := BackendConfig{
+			Name:    "h",
+			Backend: "transformers-musicgen",
+		}
+		Expect(h.HasUsecases(FLAG_ANY)).To(BeTrue())
+		Expect(h.HasUsecases(FLAG_TRANSCRIPT)).To(BeFalse())
+		Expect(h.HasUsecases(FLAG_TTS)).To(BeTrue())
+		Expect(h.HasUsecases(FLAG_SOUND_GENERATION)).To(BeTrue())
+
+		knownUsecases := FLAG_CHAT | FLAG_COMPLETION
+		i := BackendConfig{
+			Name:    "i",
+			Backend: "whisper",
+			// Earlier test checks parsing, this just needs to set final values
+			KnownUsecases: &knownUsecases,
+		}
+		Expect(i.HasUsecases(FLAG_ANY)).To(BeTrue())
+		Expect(i.HasUsecases(FLAG_TRANSCRIPT)).To(BeTrue())
+		Expect(i.HasUsecases(FLAG_TTS)).To(BeFalse())
+		Expect(i.HasUsecases(FLAG_COMPLETION)).To(BeTrue())
+		Expect(i.HasUsecases(FLAG_CHAT)).To(BeTrue())
+
+	})
 })
--- a/core/gallery/gallery.go
+++ b/core/gallery/gallery.go
@@ -132,7 +132,7 @@ func AvailableGalleryModels(galleries []config.Gallery, basePath string) ([]*Gal
 func findGalleryURLFromReferenceURL(url string, basePath string) (string, error) {
 	var refFile string
 	uri := downloader.URI(url)
-	err := uri.DownloadAndUnmarshal(basePath, func(url string, d []byte) error {
+	err := uri.DownloadWithCallback(basePath, func(url string, d []byte) error {
 		refFile = string(d)
 		if len(refFile) == 0 {
 			return fmt.Errorf("invalid reference file at url %s: %s", url, d)
@@ -156,7 +156,7 @@ func getGalleryModels(gallery config.Gallery, basePath string) ([]*GalleryModel,
 	}
 	uri := downloader.URI(gallery.URL)

-	err := uri.DownloadAndUnmarshal(basePath, func(url string, d []byte) error {
+	err := uri.DownloadWithCallback(basePath, func(url string, d []byte) error {
 		return yaml.Unmarshal(d, &models)
 	})
 	if err != nil {
--- a/core/gallery/models.go
+++ b/core/gallery/models.go
@@ -69,7 +69,7 @@ type PromptTemplate struct {
 func GetGalleryConfigFromURL(url string, basePath string) (Config, error) {
 	var config Config
 	uri := downloader.URI(url)
-	err := uri.DownloadAndUnmarshal(basePath, func(url string, d []byte) error {
+	err := uri.DownloadWithCallback(basePath, func(url string, d []byte) error {
 		return yaml.Unmarshal(d, &config)
 	})
 	if err != nil {
--- a/core/http/app.go
+++ b/core/http/app.go
@@ -3,13 +3,15 @@ package http
 import (
 	"embed"
 	"errors"
+	"fmt"
 	"net/http"
-	"strings"

+	"github.com/dave-gray101/v2keyauth"
 	"github.com/mudler/LocalAI/pkg/utils"

 	"github.com/mudler/LocalAI/core/http/endpoints/localai"
 	"github.com/mudler/LocalAI/core/http/endpoints/openai"
+	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/core/http/routes"

 	"github.com/mudler/LocalAI/core/config"
@@ -29,24 +31,6 @@ import (
 	"github.com/rs/zerolog/log"
 )

-func readAuthHeader(c *fiber.Ctx) string {
-	authHeader := c.Get("Authorization")
-
-	// elevenlabs
-	xApiKey := c.Get("xi-api-key")
-	if xApiKey != "" {
-		authHeader = "Bearer " + xApiKey
-	}
-
-	// anthropic
-	xApiKey = c.Get("x-api-key")
-	if xApiKey != "" {
-		authHeader = "Bearer " + xApiKey
-	}
-
-	return authHeader
-}
-
 // Embed a directory
 //
 //go:embed static/*
@@ -137,37 +121,17 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi
 		})
 	}

-	// Auth middleware checking if API key is valid. If no API key is set, no auth is required.
-	auth := func(c *fiber.Ctx) error {
-		if len(appConfig.ApiKeys) == 0 {
-			return c.Next()
-		}
+ // Health Checks should always be exempt from auth, so register these first
+	routes.HealthRoutes(app)

-		if len(appConfig.ApiKeys) == 0 {
-			return c.Next()
-		}
-
-		authHeader := readAuthHeader(c)
-		if authHeader == "" {
-			return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Authorization header missing"})
-		}
-
-		// If it's a bearer token
-		authHeaderParts := strings.Split(authHeader, " ")
-		if len(authHeaderParts) != 2 || authHeaderParts[0] != "Bearer" {
-			return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid Authorization header format"})
-		}
-
-		apiKey := authHeaderParts[1]
-		for _, key := range appConfig.ApiKeys {
-			if apiKey == key {
-				return c.Next()
-			}
-		}
-
-		return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid API key"})
+	kaConfig, err := middleware.GetKeyAuthConfig(appConfig)
+	if err != nil || kaConfig == nil {
+		return nil, fmt.Errorf("failed to create key auth config: %w", err)
 	}

+	// Auth is applied to _all_ endpoints. No exceptions. Filtering out endpoints to bypass is the role of the Filter property of the KeyAuth Configuration
+	app.Use(v2keyauth.New(*kaConfig))
+
 	if appConfig.CORS {
 		var c func(ctx *fiber.Ctx) error
 		if appConfig.CORSAllowOrigins == "" {
@@ -192,13 +156,13 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi
 	galleryService := services.NewGalleryService(appConfig)
 	galleryService.Start(appConfig.Context, cl)

-	routes.RegisterElevenLabsRoutes(app, cl, ml, appConfig, auth)
-	routes.RegisterLocalAIRoutes(app, cl, ml, appConfig, galleryService, auth)
-	routes.RegisterOpenAIRoutes(app, cl, ml, appConfig, auth)
+	routes.RegisterElevenLabsRoutes(app, cl, ml, appConfig)
+	routes.RegisterLocalAIRoutes(app, cl, ml, appConfig, galleryService)
+	routes.RegisterOpenAIRoutes(app, cl, ml, appConfig)
 	if !appConfig.DisableWebUI {
-		routes.RegisterUIRoutes(app, cl, ml, appConfig, galleryService, auth)
+		routes.RegisterUIRoutes(app, cl, ml, appConfig, galleryService)
 	}
-	routes.RegisterJINARoutes(app, cl, ml, appConfig, auth)
+	routes.RegisterJINARoutes(app, cl, ml, appConfig)

 	httpFS := http.FS(embedDirStatic)

--- a/core/http/app_test.go
+++ b/core/http/app_test.go
@@ -31,6 +31,9 @@ import (
 	"github.com/sashabaranov/go-openai/jsonschema"
 )

+const apiKey = "joshua"
+const bearerKey = "Bearer " + apiKey
+
 const testPrompt = `### System:
 You are an AI assistant that follows instruction extremely well. Help as much as you can.

@@ -50,11 +53,19 @@ type modelApplyRequest struct {

 func getModelStatus(url string) (response map[string]interface{}) {
 	// Create the HTTP request
-	resp, err := http.Get(url)
+	req, err := http.NewRequest("GET", url, nil)
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Authorization", bearerKey)
 	if err != nil {
 		fmt.Println("Error creating request:", err)
 		return
 	}
+	client := &http.Client{}
+	resp, err := client.Do(req)
+	if err != nil {
+		fmt.Println("Error sending request:", err)
+		return
+	}
 	defer resp.Body.Close()

 	body, err := io.ReadAll(resp.Body)
@@ -72,14 +83,15 @@ func getModelStatus(url string) (response map[string]interface{}) {
 	return
 }

-func getModels(url string) (response []gallery.GalleryModel) {
+func getModels(url string) ([]gallery.GalleryModel, error) {
+	response := []gallery.GalleryModel{}
 	uri := downloader.URI(url)
 	// TODO: No tests currently seem to exercise file:// urls. Fix?
-	uri.DownloadAndUnmarshal("", func(url string, i []byte) error {
+	err := uri.DownloadWithAuthorizationAndCallback("", bearerKey, func(url string, i []byte) error {
 		// Unmarshal YAML data into a struct
 		return json.Unmarshal(i, &response)
 	})
-	return
+	return response, err
 }

 func postModelApplyRequest(url string, request modelApplyRequest) (response map[string]interface{}) {
@@ -101,6 +113,7 @@ func postModelApplyRequest(url string, request modelApplyRequest) (response map[
 		return
 	}
 	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Authorization", bearerKey)

 	// Make the request
 	client := &http.Client{}
@@ -140,6 +153,7 @@ func postRequestJSON[B any](url string, bodyJson *B) error {
 	}

 	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Authorization", bearerKey)

 	client := &http.Client{}
 	resp, err := client.Do(req)
@@ -175,6 +189,7 @@ func postRequestResponseJSON[B1 any, B2 any](url string, reqJson *B1, respJson *
 	}

 	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Authorization", bearerKey)

 	client := &http.Client{}
 	resp, err := client.Do(req)
@@ -195,6 +210,35 @@ func postRequestResponseJSON[B1 any, B2 any](url string, reqJson *B1, respJson *
 	return json.Unmarshal(body, respJson)
 }

+func postInvalidRequest(url string) (error, int) {
+
+	req, err := http.NewRequest("POST", url, bytes.NewBufferString("invalid request"))
+	if err != nil {
+		return err, -1
+	}
+
+	req.Header.Set("Content-Type", "application/json")
+
+	client := &http.Client{}
+	resp, err := client.Do(req)
+	if err != nil {
+		return err, -1
+	}
+
+	defer resp.Body.Close()
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return err, -1
+	}
+
+	if resp.StatusCode < 200 || resp.StatusCode >= 400 {
+		return fmt.Errorf("unexpected status code: %d, body: %s", resp.StatusCode, string(body)), resp.StatusCode
+	}
+
+	return nil, resp.StatusCode
+}
+
 //go:embed backend-assets/*
 var backendAssets embed.FS

@@ -260,6 +304,7 @@ var _ = Describe("API test", func() {
 					config.WithContext(c),
 					config.WithGalleries(galleries),
 					config.WithModelPath(modelDir),
+					config.WithApiKeys([]string{apiKey}),
 					config.WithBackendAssets(backendAssets),
 					config.WithBackendAssetsOutput(backendAssetsDir))...)
 			Expect(err).ToNot(HaveOccurred())
@@ -269,7 +314,7 @@ var _ = Describe("API test", func() {

 			go app.Listen("127.0.0.1:9090")

-			defaultConfig := openai.DefaultConfig("")
+			defaultConfig := openai.DefaultConfig(apiKey)
 			defaultConfig.BaseURL = "http://127.0.0.1:9090/v1"

 			client2 = openaigo.NewClient("")
@@ -295,10 +340,19 @@ var _ = Describe("API test", func() {
 			Expect(err).To(HaveOccurred())
 		})

+		Context("Auth Tests", func() {
+			It("Should fail if the api key is missing", func() {
+				err, sc := postInvalidRequest("http://127.0.0.1:9090/models/available")
+				Expect(err).ToNot(BeNil())
+				Expect(sc).To(Equal(403))
+			})
+		})
+
 		Context("Applying models", func() {

 			It("applies models from a gallery", func() {
-				models := getModels("http://127.0.0.1:9090/models/available")
+				models, err := getModels("http://127.0.0.1:9090/models/available")
+				Expect(err).To(BeNil())
 				Expect(len(models)).To(Equal(2), fmt.Sprint(models))
 				Expect(models[0].Installed).To(BeFalse(), fmt.Sprint(models))
 				Expect(models[1].Installed).To(BeFalse(), fmt.Sprint(models))
@@ -331,7 +385,8 @@ var _ = Describe("API test", func() {
 				Expect(content["backend"]).To(Equal("bert-embeddings"))
 				Expect(content["foo"]).To(Equal("bar"))

-				models = getModels("http://127.0.0.1:9090/models/available")
+				models, err = getModels("http://127.0.0.1:9090/models/available")
+				Expect(err).To(BeNil())
 				Expect(len(models)).To(Equal(2), fmt.Sprint(models))
 				Expect(models[0].Name).To(Or(Equal("bert"), Equal("bert2")))
 				Expect(models[1].Name).To(Or(Equal("bert"), Equal("bert2")))
--- a/core/http/ctx/fiber.go
+++ b/core/http/ctx/fiber.go
@@ -19,14 +19,16 @@ func ModelFromContext(ctx *fiber.Ctx, cl *config.BackendConfigLoader, loader *mo
 	if ctx.Params("model") != "" {
 		modelInput = ctx.Params("model")
 	}
-
+	if ctx.Query("model") != "" {
+		modelInput = ctx.Query("model")
+	}
 	// Set model from bearer token, if available
-	bearer := strings.TrimLeft(ctx.Get("authorization"), "Bearer ")
+	bearer := strings.TrimLeft(ctx.Get("authorization"), "Bear ") // Reduced duplicate characters of Bearer
 	bearerExists := bearer != "" && loader.ExistsInModelPath(bearer)

 	// If no model was specified, take the first available
 	if modelInput == "" && !bearerExists && firstModel {
-		models, _ := services.ListModels(cl, loader, "", true)
+		models, _ := services.ListModels(cl, loader, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
 		if len(models) > 0 {
 			modelInput = models[0]
 			log.Debug().Msgf("No model specified, using: %s", modelInput)
--- a/core/http/endpoints/elevenlabs/soundgeneration.go
+++ b/core/http/endpoints/elevenlabs/soundgeneration.go
@@ -55,7 +55,7 @@ func SoundGenerationEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoad
 		}

 		// TODO: Support uploading files?
-		filePath, _, err := backend.SoundGeneration(cfg.Backend, modelFile, input.Text, input.Duration, input.Temperature, input.DoSample, nil, nil, ml, appConfig, *cfg)
+		filePath, _, err := backend.SoundGeneration(modelFile, input.Text, input.Duration, input.Temperature, input.DoSample, nil, nil, ml, appConfig, *cfg)
 		if err != nil {
 			return err
 		}
--- a/core/http/endpoints/jina/rerank.go
+++ b/core/http/endpoints/jina/rerank.go
@@ -45,13 +45,13 @@ func JINARerankEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a
 			config.LoadOptionContextSize(appConfig.ContextSize),
 			config.LoadOptionF16(appConfig.F16),
 		)
-
 		if err != nil {
 			modelFile = input.Model
 			log.Warn().Msgf("Model not found in context: %s", input.Model)
 		} else {
 			modelFile = cfg.Model
 		}
+
 		log.Debug().Msgf("Request for model: %s", modelFile)

 		if input.Backend != "" {
@@ -64,7 +64,7 @@ func JINARerankEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a
 			Documents: req.Documents,
 		}

-		results, err := backend.Rerank(cfg.Backend, modelFile, request, ml, appConfig, *cfg)
+		results, err := backend.Rerank(modelFile, request, ml, appConfig, *cfg)
 		if err != nil {
 			return err
 		}
--- a/core/http/endpoints/localai/get_token_metrics.go
+++ b/core/http/endpoints/localai/get_token_metrics.go
@@ -0,0 +1,60 @@
+package localai
+
+import (
+	"github.com/gofiber/fiber/v2"
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	fiberContext "github.com/mudler/LocalAI/core/http/ctx"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/rs/zerolog/log"
+
+	"github.com/mudler/LocalAI/pkg/model"
+)
+
+// TokenMetricsEndpoint is an endpoint to get TokensProcessed Per Second for Active SlotID
+//
+//	@Summary	Get TokenMetrics for Active Slot.
+//	@Accept json
+//	@Produce audio/x-wav
+//	@Success	200		{string}	binary				"generated audio/wav file"
+//	@Router		/v1/tokenMetrics [get]
+//	@Router		/tokenMetrics [get]
+func TokenMetricsEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+
+		input := new(schema.TokenMetricsRequest)
+
+		// Get input data from the request body
+		if err := c.BodyParser(input); err != nil {
+			return err
+		}
+
+		modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.Model, false)
+		if err != nil {
+			modelFile = input.Model
+			log.Warn().Msgf("Model not found in context: %s", input.Model)
+		}
+
+		cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
+			config.LoadOptionDebug(appConfig.Debug),
+			config.LoadOptionThreads(appConfig.Threads),
+			config.LoadOptionContextSize(appConfig.ContextSize),
+			config.LoadOptionF16(appConfig.F16),
+		)
+
+		if err != nil {
+			log.Err(err)
+			modelFile = input.Model
+			log.Warn().Msgf("Model not found in context: %s", input.Model)
+		} else {
+			modelFile = cfg.Model
+		}
+		log.Debug().Msgf("Token Metrics for model: %s", modelFile)
+
+		response, err := backend.TokenMetrics(modelFile, ml, appConfig, *cfg)
+		if err != nil {
+			return err
+		}
+		return c.JSON(response)
+	}
+}
--- a/core/http/endpoints/localai/system.go
+++ b/core/http/endpoints/localai/system.go
@@ -17,12 +17,14 @@ func SystemInformations(ml *model.ModelLoader, appConfig *config.ApplicationConf
 		if err != nil {
 			return err
 		}
+		loadedModels := ml.ListModels()
 		for b := range appConfig.ExternalGRPCBackends {
 			availableBackends = append(availableBackends, b)
 		}
 		return c.JSON(
 			schema.SystemInformationResponse{
 				Backends: availableBackends,
+				Models:   loadedModels,
 			},
 		)
 	}
--- a/core/http/endpoints/localai/tokenize.go
+++ b/core/http/endpoints/localai/tokenize.go
@@ -0,0 +1,58 @@
+package localai
+
+import (
+	"github.com/gofiber/fiber/v2"
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	fiberContext "github.com/mudler/LocalAI/core/http/ctx"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/rs/zerolog/log"
+)
+
+// TokenizeEndpoint exposes a REST API to tokenize the content
+// @Summary Tokenize the input.
+// @Success 200 {object} schema.TokenizeResponse "Response"
+// @Router /v1/tokenize [post]
+func TokenizeEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+
+		input := new(schema.TokenizeRequest)
+
+		// Get input data from the request body
+		if err := c.BodyParser(input); err != nil {
+			return err
+		}
+
+		modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.Model, false)
+		if err != nil {
+			modelFile = input.Model
+			log.Warn().Msgf("Model not found in context: %s", input.Model)
+		}
+
+		cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
+			config.LoadOptionDebug(appConfig.Debug),
+			config.LoadOptionThreads(appConfig.Threads),
+			config.LoadOptionContextSize(appConfig.ContextSize),
+			config.LoadOptionF16(appConfig.F16),
+		)
+
+		if err != nil {
+			log.Err(err)
+			modelFile = input.Model
+			log.Warn().Msgf("Model not found in context: %s", input.Model)
+		} else {
+			modelFile = cfg.Model
+		}
+		log.Debug().Msgf("Request for model: %s", modelFile)
+
+		tokenResponse, err := backend.ModelTokenize(input.Content, ml, *cfg, appConfig)
+		if err != nil {
+			return err
+		}
+
+		c.JSON(tokenResponse)
+		return nil
+
+	}
+}
--- a/core/http/endpoints/localai/welcome.go
+++ b/core/http/endpoints/localai/welcome.go
@@ -13,7 +13,7 @@ import (
 func WelcomeEndpoint(appConfig *config.ApplicationConfig,
 	cl *config.BackendConfigLoader, ml *model.ModelLoader, modelStatus func() (map[string]string, map[string]string)) func(*fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
-		models, _ := services.ListModels(cl, ml, "", true)
+		models, _ := services.ListModels(cl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
 		backendConfigs := cl.GetAllBackendConfigs()

 		galleryConfigs := map[string]*gallery.Config{}
@@ -32,18 +32,10 @@ func WelcomeEndpoint(appConfig *config.ApplicationConfig,
 		// Get model statuses to display in the UI the operation in progress
 		processingModels, taskTypes := modelStatus()

-		modelsWithoutConfig := []string{}
-
-		for _, m := range models {
-			if _, ok := modelsWithBackendConfig[m]; !ok {
-				modelsWithoutConfig = append(modelsWithoutConfig, m)
-			}
-		}
-
 		summary := fiber.Map{
 			"Title":             "LocalAI API - " + internal.PrintableVersion(),
 			"Version":           internal.PrintableVersion(),
-			"Models":            modelsWithoutConfig,
+			"Models":            models,
 			"ModelsConfig":      backendConfigs,
 			"GalleryConfig":     galleryConfigs,
 			"IsP2PEnabled":      p2p.IsP2PEnabled(),
--- a/core/http/endpoints/openai/assistant.go
+++ b/core/http/endpoints/openai/assistant.go
@@ -225,7 +225,7 @@ func filterAssistantsAfterID(assistants []Assistant, id string) []Assistant {

 func modelExists(cl *config.BackendConfigLoader, ml *model.ModelLoader, modelName string) (found bool) {
 	found = false
-	models, err := services.ListModels(cl, ml, "", true)
+	models, err := services.ListModels(cl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
 	if err != nil {
 		return
 	}
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -161,6 +161,12 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 		textContentToReturn = ""
 		id = uuid.New().String()
 		created = int(time.Now().Unix())
+		// Set CorrelationID
+		correlationID := c.Get("X-Correlation-ID")
+		if len(strings.TrimSpace(correlationID)) == 0 {
+			correlationID = id
+		}
+		c.Set("X-Correlation-ID", correlationID)

 		modelFile, input, err := readRequest(c, cl, ml, startupOptions, true)
 		if err != nil {
@@ -444,6 +450,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 			c.Set("Cache-Control", "no-cache")
 			c.Set("Connection", "keep-alive")
 			c.Set("Transfer-Encoding", "chunked")
+			c.Set("X-Correlation-ID", id)

 			responses := make(chan schema.OpenAIResponse)

@@ -640,8 +647,16 @@ func handleQuestion(config *config.BackendConfig, input *schema.OpenAIRequest, m
 	for _, m := range input.Messages {
 		images = append(images, m.StringImages...)
 	}
+	videos := []string{}
+	for _, m := range input.Messages {
+		videos = append(videos, m.StringVideos...)
+	}
+	audios := []string{}
+	for _, m := range input.Messages {
+		audios = append(audios, m.StringAudios...)
+	}

-	predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, ml, *config, o, nil)
+	predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, videos, audios, ml, *config, o, nil)
 	if err != nil {
 		log.Error().Err(err).Msg("model inference failed")
 		return "", err
--- a/core/http/endpoints/openai/completion.go
+++ b/core/http/endpoints/openai/completion.go
@@ -57,6 +57,8 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a
 	}

 	return func(c *fiber.Ctx) error {
+		// Add Correlation
+		c.Set("X-Correlation-ID", id)
 		modelFile, input, err := readRequest(c, cl, ml, appConfig, true)
 		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
--- a/core/http/endpoints/openai/inference.go
+++ b/core/http/endpoints/openai/inference.go
@@ -27,9 +27,17 @@ func ComputeChoices(
 	for _, m := range req.Messages {
 		images = append(images, m.StringImages...)
 	}
+	videos := []string{}
+	for _, m := range req.Messages {
+		videos = append(videos, m.StringVideos...)
+	}
+	audios := []string{}
+	for _, m := range req.Messages {
+		audios = append(audios, m.StringAudios...)
+	}

 	// get the model function to call for the result
-	predFunc, err := backend.ModelInference(req.Context, predInput, req.Messages, images, loader, *config, o, tokenCallback)
+	predFunc, err := backend.ModelInference(req.Context, predInput, req.Messages, images, videos, audios, loader, *config, o, tokenCallback)
 	if err != nil {
 		return result, backend.TokenUsage{}, err
 	}
--- a/core/http/endpoints/openai/list.go
+++ b/core/http/endpoints/openai/list.go
@@ -18,32 +18,32 @@ func ListModelsEndpoint(bcl *config.BackendConfigLoader, ml *model.ModelLoader)
 		filter := c.Query("filter")

 		// By default, exclude any loose files that are already referenced by a configuration file.
-		excludeConfigured := c.QueryBool("excludeConfigured", true)
+		var policy services.LooseFilePolicy
+		if c.QueryBool("excludeConfigured", true) {
+			policy = services.SKIP_IF_CONFIGURED
+		} else {
+			policy = services.ALWAYS_INCLUDE // This replicates current behavior. TODO: give more options to the user?
+		}

-		dataModels, err := modelList(bcl, ml, filter, excludeConfigured)
+		filterFn, err := config.BuildNameFilterFn(filter)
 		if err != nil {
 			return err
 		}
+
+		modelNames, err := services.ListModels(bcl, ml, filterFn, policy)
+		if err != nil {
+			return err
+		}
+
+		// Map from a slice of names to a slice of OpenAIModel response objects
+		dataModels := []schema.OpenAIModel{}
+		for _, m := range modelNames {
+			dataModels = append(dataModels, schema.OpenAIModel{ID: m, Object: "model"})
+		}
+
 		return c.JSON(schema.ModelsDataResponse{
 			Object: "list",
 			Data:   dataModels,
 		})
 	}
 }
-
-func modelList(bcl *config.BackendConfigLoader, ml *model.ModelLoader, filter string, excludeConfigured bool) ([]schema.OpenAIModel, error) {
-
-	models, err := services.ListModels(bcl, ml, filter, excludeConfigured)
-	if err != nil {
-		return nil, err
-	}
-
-	dataModels := []schema.OpenAIModel{}
-
-	// Then iterate through the loose files:
-	for _, m := range models {
-		dataModels = append(dataModels, schema.OpenAIModel{ID: m, Object: "model"})
-	}
-
-	return dataModels, nil
-}
--- a/core/http/endpoints/openai/request.go
+++ b/core/http/endpoints/openai/request.go
@@ -6,15 +6,22 @@ import (
 	"fmt"

 	"github.com/gofiber/fiber/v2"
+	"github.com/google/uuid"
 	"github.com/mudler/LocalAI/core/config"
 	fiberContext "github.com/mudler/LocalAI/core/http/ctx"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/functions"
 	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/pkg/templates"
 	"github.com/mudler/LocalAI/pkg/utils"
 	"github.com/rs/zerolog/log"
 )

+type correlationIDKeyType string
+
+// CorrelationIDKey to track request across process boundary
+const CorrelationIDKey correlationIDKeyType = "correlationID"
+
 func readRequest(c *fiber.Ctx, cl *config.BackendConfigLoader, ml *model.ModelLoader, o *config.ApplicationConfig, firstModel bool) (string, *schema.OpenAIRequest, error) {
 	input := new(schema.OpenAIRequest)

@@ -24,9 +31,14 @@ func readRequest(c *fiber.Ctx, cl *config.BackendConfigLoader, ml *model.ModelLo
 	}

 	received, _ := json.Marshal(input)
+	// Extract or generate the correlation ID
+	correlationID := c.Get("X-Correlation-ID", uuid.New().String())

 	ctx, cancel := context.WithCancel(o.Context)
-	input.Context = ctx
+	// Add the correlation ID to the new context
+	ctxWithCorrelationID := context.WithValue(ctx, CorrelationIDKey, correlationID)
+
+	input.Context = ctxWithCorrelationID
 	input.Cancel = cancel

 	log.Debug().Msgf("Request received: %s", string(received))
@@ -135,7 +147,7 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
 	}

 	// Decode each request's message content
-	index := 0
+	imgIndex, vidIndex, audioIndex := 0, 0, 0
 	for i, m := range input.Messages {
 		switch content := m.Content.(type) {
 		case string:
@@ -144,20 +156,58 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
 			dat, _ := json.Marshal(content)
 			c := []schema.Content{}
 			json.Unmarshal(dat, &c)
+		CONTENT:
 			for _, pp := range c {
-				if pp.Type == "text" {
+				switch pp.Type {
+				case "text":
 					input.Messages[i].StringContent = pp.Text
-				} else if pp.Type == "image_url" {
-					// Detect if pp.ImageURL is an URL, if it is download the image and encode it in base64:
-					base64, err := utils.GetImageURLAsBase64(pp.ImageURL.URL)
-					if err == nil {
-						input.Messages[i].StringImages = append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff
-						// set a placeholder for each image
-						input.Messages[i].StringContent = fmt.Sprintf("[img-%d]", index) + input.Messages[i].StringContent
-						index++
-					} else {
-						log.Error().Msgf("Failed encoding image: %s", err)
+				case "video", "video_url":
+					// Decode content as base64 either if it's an URL or base64 text
+					base64, err := utils.GetContentURIAsBase64(pp.VideoURL.URL)
+					if err != nil {
+						log.Error().Msgf("Failed encoding video: %s", err)
+						continue CONTENT
 					}
+					input.Messages[i].StringVideos = append(input.Messages[i].StringVideos, base64) // TODO: make sure that we only return base64 stuff
+
+					t := "[vid-{{.ID}}]{{.Text}}"
+					if config.TemplateConfig.Video != "" {
+						t = config.TemplateConfig.Video
+					}
+					// set a placeholder for each image
+					input.Messages[i].StringContent, _ = templates.TemplateMultiModal(t, vidIndex, input.Messages[i].StringContent)
+					vidIndex++
+				case "audio_url", "audio":
+					// Decode content as base64 either if it's an URL or base64 text
+					base64, err := utils.GetContentURIAsBase64(pp.AudioURL.URL)
+					if err != nil {
+						log.Error().Msgf("Failed encoding image: %s", err)
+						continue CONTENT
+					}
+					input.Messages[i].StringAudios = append(input.Messages[i].StringAudios, base64) // TODO: make sure that we only return base64 stuff
+					// set a placeholder for each image
+					t := "[audio-{{.ID}}]{{.Text}}"
+					if config.TemplateConfig.Audio != "" {
+						t = config.TemplateConfig.Audio
+					}
+					input.Messages[i].StringContent, _ = templates.TemplateMultiModal(t, audioIndex, input.Messages[i].StringContent)
+					audioIndex++
+				case "image_url", "image":
+					// Decode content as base64 either if it's an URL or base64 text
+					base64, err := utils.GetContentURIAsBase64(pp.ImageURL.URL)
+					if err != nil {
+						log.Error().Msgf("Failed encoding image: %s", err)
+						continue CONTENT
+					}
+
+					t := "[img-{{.ID}}]{{.Text}}"
+					if config.TemplateConfig.Image != "" {
+						t = config.TemplateConfig.Image
+					}
+					input.Messages[i].StringImages = append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff
+					// set a placeholder for each image
+					input.Messages[i].StringContent, _ = templates.TemplateMultiModal(t, imgIndex, input.Messages[i].StringContent)
+					imgIndex++
 				}
 			}
 		}
--- a/core/http/middleware/auth.go
+++ b/core/http/middleware/auth.go
@@ -0,0 +1,94 @@
+package middleware
+
+import (
+	"crypto/subtle"
+	"errors"
+
+	"github.com/dave-gray101/v2keyauth"
+	"github.com/gofiber/fiber/v2"
+	"github.com/gofiber/fiber/v2/middleware/keyauth"
+	"github.com/mudler/LocalAI/core/config"
+)
+
+// This file contains the configuration generators and handler functions that are used along with the fiber/keyauth middleware
+// Currently this requires an upstream patch - and feature patches are no longer accepted to v2
+// Therefore `dave-gray101/v2keyauth` contains the v2 backport of the middleware until v3 stabilizes and we migrate.
+
+func GetKeyAuthConfig(applicationConfig *config.ApplicationConfig) (*v2keyauth.Config, error) {
+	customLookup, err := v2keyauth.MultipleKeySourceLookup([]string{"header:Authorization", "header:x-api-key", "header:xi-api-key"}, keyauth.ConfigDefault.AuthScheme)
+	if err != nil {
+		return nil, err
+	}
+
+	return &v2keyauth.Config{
+		CustomKeyLookup: customLookup,
+		Next:            getApiKeyRequiredFilterFunction(applicationConfig),
+		Validator:       getApiKeyValidationFunction(applicationConfig),
+		ErrorHandler:    getApiKeyErrorHandler(applicationConfig),
+		AuthScheme:      "Bearer",
+	}, nil
+}
+
+func getApiKeyErrorHandler(applicationConfig *config.ApplicationConfig) fiber.ErrorHandler {
+	return func(ctx *fiber.Ctx, err error) error {
+		if errors.Is(err, v2keyauth.ErrMissingOrMalformedAPIKey) {
+			if len(applicationConfig.ApiKeys) == 0 {
+				return ctx.Next() // if no keys are set up, any error we get here is not an error.
+			}
+			if applicationConfig.OpaqueErrors {
+				return ctx.SendStatus(403)
+			}
+			return ctx.Status(403).SendString(err.Error())
+		}
+		if applicationConfig.OpaqueErrors {
+			return ctx.SendStatus(500)
+		}
+		return err
+	}
+}
+
+func getApiKeyValidationFunction(applicationConfig *config.ApplicationConfig) func(*fiber.Ctx, string) (bool, error) {
+
+	if applicationConfig.UseSubtleKeyComparison {
+		return func(ctx *fiber.Ctx, apiKey string) (bool, error) {
+			if len(applicationConfig.ApiKeys) == 0 {
+				return true, nil // If no keys are setup, accept everything
+			}
+			for _, validKey := range applicationConfig.ApiKeys {
+				if subtle.ConstantTimeCompare([]byte(apiKey), []byte(validKey)) == 1 {
+					return true, nil
+				}
+			}
+			return false, v2keyauth.ErrMissingOrMalformedAPIKey
+		}
+	}
+
+	return func(ctx *fiber.Ctx, apiKey string) (bool, error) {
+		if len(applicationConfig.ApiKeys) == 0 {
+			return true, nil // If no keys are setup, accept everything
+		}
+		for _, validKey := range applicationConfig.ApiKeys {
+			if apiKey == validKey {
+				return true, nil
+			}
+		}
+		return false, v2keyauth.ErrMissingOrMalformedAPIKey
+	}
+}
+
+func getApiKeyRequiredFilterFunction(applicationConfig *config.ApplicationConfig) func(*fiber.Ctx) bool {
+	if applicationConfig.DisableApiKeyRequirementForHttpGet {
+		return func(c *fiber.Ctx) bool {
+			if c.Method() != "GET" {
+				return false
+			}
+			for _, rx := range applicationConfig.HttpGetExemptedEndpoints {
+				if rx.MatchString(c.Path()) {
+					return true
+				}
+			}
+			return false
+		}
+	}
+	return func(c *fiber.Ctx) bool { return false }
+}
--- a/core/http/routes/elevenlabs.go
+++ b/core/http/routes/elevenlabs.go
@@ -10,12 +10,11 @@ import (
 func RegisterElevenLabsRoutes(app *fiber.App,
 	cl *config.BackendConfigLoader,
 	ml *model.ModelLoader,
-	appConfig *config.ApplicationConfig,
-	auth func(*fiber.Ctx) error) {
+	appConfig *config.ApplicationConfig) {

 	// Elevenlabs
-	app.Post("/v1/text-to-speech/:voice-id", auth, elevenlabs.TTSEndpoint(cl, ml, appConfig))
+	app.Post("/v1/text-to-speech/:voice-id", elevenlabs.TTSEndpoint(cl, ml, appConfig))

-	app.Post("/v1/sound-generation", auth, elevenlabs.SoundGenerationEndpoint(cl, ml, appConfig))
+	app.Post("/v1/sound-generation", elevenlabs.SoundGenerationEndpoint(cl, ml, appConfig))

 }
--- a/core/http/routes/health.go
+++ b/core/http/routes/health.go
@@ -0,0 +1,13 @@
+package routes
+
+import "github.com/gofiber/fiber/v2"
+
+func HealthRoutes(app *fiber.App) {
+	// Service health checks
+	ok := func(c *fiber.Ctx) error {
+		return c.SendStatus(200)
+	}
+
+	app.Get("/healthz", ok)
+	app.Get("/readyz", ok)
+}
--- a/core/http/routes/jina.go
+++ b/core/http/routes/jina.go
@@ -11,8 +11,7 @@ import (
 func RegisterJINARoutes(app *fiber.App,
 	cl *config.BackendConfigLoader,
 	ml *model.ModelLoader,
-	appConfig *config.ApplicationConfig,
-	auth func(*fiber.Ctx) error) {
+	appConfig *config.ApplicationConfig) {

 	// POST endpoint to mimic the reranking
 	app.Post("/v1/rerank", jina.JINARerankEndpoint(cl, ml, appConfig))
--- a/core/http/routes/localai.go
+++ b/core/http/routes/localai.go
@@ -15,61 +15,55 @@ func RegisterLocalAIRoutes(app *fiber.App,
 	cl *config.BackendConfigLoader,
 	ml *model.ModelLoader,
 	appConfig *config.ApplicationConfig,
-	galleryService *services.GalleryService,
-	auth func(*fiber.Ctx) error) {
+	galleryService *services.GalleryService) {

 	app.Get("/swagger/*", swagger.HandlerDefault) // default

 	// LocalAI API endpoints
 	if !appConfig.DisableGalleryEndpoint {
 		modelGalleryEndpointService := localai.CreateModelGalleryEndpointService(appConfig.Galleries, appConfig.ModelPath, galleryService)
-		app.Post("/models/apply", auth, modelGalleryEndpointService.ApplyModelGalleryEndpoint())
-		app.Post("/models/delete/:name", auth, modelGalleryEndpointService.DeleteModelGalleryEndpoint())
+		app.Post("/models/apply", modelGalleryEndpointService.ApplyModelGalleryEndpoint())
+		app.Post("/models/delete/:name", modelGalleryEndpointService.DeleteModelGalleryEndpoint())

-		app.Get("/models/available", auth, modelGalleryEndpointService.ListModelFromGalleryEndpoint())
-		app.Get("/models/galleries", auth, modelGalleryEndpointService.ListModelGalleriesEndpoint())
-		app.Post("/models/galleries", auth, modelGalleryEndpointService.AddModelGalleryEndpoint())
-		app.Delete("/models/galleries", auth, modelGalleryEndpointService.RemoveModelGalleryEndpoint())
-		app.Get("/models/jobs/:uuid", auth, modelGalleryEndpointService.GetOpStatusEndpoint())
-		app.Get("/models/jobs", auth, modelGalleryEndpointService.GetAllStatusEndpoint())
+		app.Get("/models/available", modelGalleryEndpointService.ListModelFromGalleryEndpoint())
+		app.Get("/models/galleries", modelGalleryEndpointService.ListModelGalleriesEndpoint())
+		app.Post("/models/galleries", modelGalleryEndpointService.AddModelGalleryEndpoint())
+		app.Delete("/models/galleries", modelGalleryEndpointService.RemoveModelGalleryEndpoint())
+		app.Get("/models/jobs/:uuid", modelGalleryEndpointService.GetOpStatusEndpoint())
+		app.Get("/models/jobs", modelGalleryEndpointService.GetAllStatusEndpoint())
 	}

-	app.Post("/tts", auth, localai.TTSEndpoint(cl, ml, appConfig))
+	app.Post("/tts", localai.TTSEndpoint(cl, ml, appConfig))

 	// Stores
 	sl := model.NewModelLoader("")
-	app.Post("/stores/set", auth, localai.StoresSetEndpoint(sl, appConfig))
-	app.Post("/stores/delete", auth, localai.StoresDeleteEndpoint(sl, appConfig))
-	app.Post("/stores/get", auth, localai.StoresGetEndpoint(sl, appConfig))
-	app.Post("/stores/find", auth, localai.StoresFindEndpoint(sl, appConfig))
+	app.Post("/stores/set", localai.StoresSetEndpoint(sl, appConfig))
+	app.Post("/stores/delete", localai.StoresDeleteEndpoint(sl, appConfig))
+	app.Post("/stores/get", localai.StoresGetEndpoint(sl, appConfig))
+	app.Post("/stores/find", localai.StoresFindEndpoint(sl, appConfig))

-	// Kubernetes health checks
-	ok := func(c *fiber.Ctx) error {
-		return c.SendStatus(200)
-	}
-
-	app.Get("/healthz", ok)
-	app.Get("/readyz", ok)
-
-	app.Get("/metrics", auth, localai.LocalAIMetricsEndpoint())
+	app.Get("/metrics", localai.LocalAIMetricsEndpoint())

 	// Experimental Backend Statistics Module
 	backendMonitorService := services.NewBackendMonitorService(ml, cl, appConfig) // Split out for now
-	app.Get("/backend/monitor", auth, localai.BackendMonitorEndpoint(backendMonitorService))
-	app.Post("/backend/shutdown", auth, localai.BackendShutdownEndpoint(backendMonitorService))
+	app.Get("/backend/monitor", localai.BackendMonitorEndpoint(backendMonitorService))
+	app.Post("/backend/shutdown", localai.BackendShutdownEndpoint(backendMonitorService))

 	// p2p
 	if p2p.IsP2PEnabled() {
-		app.Get("/api/p2p", auth, localai.ShowP2PNodes(appConfig))
-		app.Get("/api/p2p/token", auth, localai.ShowP2PToken(appConfig))
+		app.Get("/api/p2p", localai.ShowP2PNodes(appConfig))
+		app.Get("/api/p2p/token", localai.ShowP2PToken(appConfig))
 	}

-	app.Get("/version", auth, func(c *fiber.Ctx) error {
+	app.Get("/version", func(c *fiber.Ctx) error {
 		return c.JSON(struct {
 			Version string `json:"version"`
 		}{Version: internal.PrintableVersion()})
 	})

-	app.Get("/system", auth, localai.SystemInformations(ml, appConfig))
+	app.Get("/system", localai.SystemInformations(ml, appConfig))
+
+	// misc
+	app.Post("/v1/tokenize", localai.TokenizeEndpoint(cl, ml, appConfig))

 }
--- a/core/http/routes/openai.go
+++ b/core/http/routes/openai.go
@@ -11,66 +11,65 @@ import (
 func RegisterOpenAIRoutes(app *fiber.App,
 	cl *config.BackendConfigLoader,
 	ml *model.ModelLoader,
-	appConfig *config.ApplicationConfig,
-	auth func(*fiber.Ctx) error) {
+	appConfig *config.ApplicationConfig) {
 	// openAI compatible API endpoint

 	// chat
-	app.Post("/v1/chat/completions", auth, openai.ChatEndpoint(cl, ml, appConfig))
-	app.Post("/chat/completions", auth, openai.ChatEndpoint(cl, ml, appConfig))
+	app.Post("/v1/chat/completions", openai.ChatEndpoint(cl, ml, appConfig))
+	app.Post("/chat/completions", openai.ChatEndpoint(cl, ml, appConfig))

 	// edit
-	app.Post("/v1/edits", auth, openai.EditEndpoint(cl, ml, appConfig))
-	app.Post("/edits", auth, openai.EditEndpoint(cl, ml, appConfig))
+	app.Post("/v1/edits", openai.EditEndpoint(cl, ml, appConfig))
+	app.Post("/edits", openai.EditEndpoint(cl, ml, appConfig))

 	// assistant
-	app.Get("/v1/assistants", auth, openai.ListAssistantsEndpoint(cl, ml, appConfig))
-	app.Get("/assistants", auth, openai.ListAssistantsEndpoint(cl, ml, appConfig))
-	app.Post("/v1/assistants", auth, openai.CreateAssistantEndpoint(cl, ml, appConfig))
-	app.Post("/assistants", auth, openai.CreateAssistantEndpoint(cl, ml, appConfig))
-	app.Delete("/v1/assistants/:assistant_id", auth, openai.DeleteAssistantEndpoint(cl, ml, appConfig))
-	app.Delete("/assistants/:assistant_id", auth, openai.DeleteAssistantEndpoint(cl, ml, appConfig))
-	app.Get("/v1/assistants/:assistant_id", auth, openai.GetAssistantEndpoint(cl, ml, appConfig))
-	app.Get("/assistants/:assistant_id", auth, openai.GetAssistantEndpoint(cl, ml, appConfig))
-	app.Post("/v1/assistants/:assistant_id", auth, openai.ModifyAssistantEndpoint(cl, ml, appConfig))
-	app.Post("/assistants/:assistant_id", auth, openai.ModifyAssistantEndpoint(cl, ml, appConfig))
-	app.Get("/v1/assistants/:assistant_id/files", auth, openai.ListAssistantFilesEndpoint(cl, ml, appConfig))
-	app.Get("/assistants/:assistant_id/files", auth, openai.ListAssistantFilesEndpoint(cl, ml, appConfig))
-	app.Post("/v1/assistants/:assistant_id/files", auth, openai.CreateAssistantFileEndpoint(cl, ml, appConfig))
-	app.Post("/assistants/:assistant_id/files", auth, openai.CreateAssistantFileEndpoint(cl, ml, appConfig))
-	app.Delete("/v1/assistants/:assistant_id/files/:file_id", auth, openai.DeleteAssistantFileEndpoint(cl, ml, appConfig))
-	app.Delete("/assistants/:assistant_id/files/:file_id", auth, openai.DeleteAssistantFileEndpoint(cl, ml, appConfig))
-	app.Get("/v1/assistants/:assistant_id/files/:file_id", auth, openai.GetAssistantFileEndpoint(cl, ml, appConfig))
-	app.Get("/assistants/:assistant_id/files/:file_id", auth, openai.GetAssistantFileEndpoint(cl, ml, appConfig))
+	app.Get("/v1/assistants", openai.ListAssistantsEndpoint(cl, ml, appConfig))
+	app.Get("/assistants", openai.ListAssistantsEndpoint(cl, ml, appConfig))
+	app.Post("/v1/assistants", openai.CreateAssistantEndpoint(cl, ml, appConfig))
+	app.Post("/assistants", openai.CreateAssistantEndpoint(cl, ml, appConfig))
+	app.Delete("/v1/assistants/:assistant_id", openai.DeleteAssistantEndpoint(cl, ml, appConfig))
+	app.Delete("/assistants/:assistant_id", openai.DeleteAssistantEndpoint(cl, ml, appConfig))
+	app.Get("/v1/assistants/:assistant_id", openai.GetAssistantEndpoint(cl, ml, appConfig))
+	app.Get("/assistants/:assistant_id", openai.GetAssistantEndpoint(cl, ml, appConfig))
+	app.Post("/v1/assistants/:assistant_id", openai.ModifyAssistantEndpoint(cl, ml, appConfig))
+	app.Post("/assistants/:assistant_id", openai.ModifyAssistantEndpoint(cl, ml, appConfig))
+	app.Get("/v1/assistants/:assistant_id/files", openai.ListAssistantFilesEndpoint(cl, ml, appConfig))
+	app.Get("/assistants/:assistant_id/files", openai.ListAssistantFilesEndpoint(cl, ml, appConfig))
+	app.Post("/v1/assistants/:assistant_id/files", openai.CreateAssistantFileEndpoint(cl, ml, appConfig))
+	app.Post("/assistants/:assistant_id/files", openai.CreateAssistantFileEndpoint(cl, ml, appConfig))
+	app.Delete("/v1/assistants/:assistant_id/files/:file_id", openai.DeleteAssistantFileEndpoint(cl, ml, appConfig))
+	app.Delete("/assistants/:assistant_id/files/:file_id", openai.DeleteAssistantFileEndpoint(cl, ml, appConfig))
+	app.Get("/v1/assistants/:assistant_id/files/:file_id", openai.GetAssistantFileEndpoint(cl, ml, appConfig))
+	app.Get("/assistants/:assistant_id/files/:file_id", openai.GetAssistantFileEndpoint(cl, ml, appConfig))

 	// files
-	app.Post("/v1/files", auth, openai.UploadFilesEndpoint(cl, appConfig))
-	app.Post("/files", auth, openai.UploadFilesEndpoint(cl, appConfig))
-	app.Get("/v1/files", auth, openai.ListFilesEndpoint(cl, appConfig))
-	app.Get("/files", auth, openai.ListFilesEndpoint(cl, appConfig))
-	app.Get("/v1/files/:file_id", auth, openai.GetFilesEndpoint(cl, appConfig))
-	app.Get("/files/:file_id", auth, openai.GetFilesEndpoint(cl, appConfig))
-	app.Delete("/v1/files/:file_id", auth, openai.DeleteFilesEndpoint(cl, appConfig))
-	app.Delete("/files/:file_id", auth, openai.DeleteFilesEndpoint(cl, appConfig))
-	app.Get("/v1/files/:file_id/content", auth, openai.GetFilesContentsEndpoint(cl, appConfig))
-	app.Get("/files/:file_id/content", auth, openai.GetFilesContentsEndpoint(cl, appConfig))
+	app.Post("/v1/files", openai.UploadFilesEndpoint(cl, appConfig))
+	app.Post("/files", openai.UploadFilesEndpoint(cl, appConfig))
+	app.Get("/v1/files", openai.ListFilesEndpoint(cl, appConfig))
+	app.Get("/files", openai.ListFilesEndpoint(cl, appConfig))
+	app.Get("/v1/files/:file_id", openai.GetFilesEndpoint(cl, appConfig))
+	app.Get("/files/:file_id", openai.GetFilesEndpoint(cl, appConfig))
+	app.Delete("/v1/files/:file_id", openai.DeleteFilesEndpoint(cl, appConfig))
+	app.Delete("/files/:file_id", openai.DeleteFilesEndpoint(cl, appConfig))
+	app.Get("/v1/files/:file_id/content", openai.GetFilesContentsEndpoint(cl, appConfig))
+	app.Get("/files/:file_id/content", openai.GetFilesContentsEndpoint(cl, appConfig))

 	// completion
-	app.Post("/v1/completions", auth, openai.CompletionEndpoint(cl, ml, appConfig))
-	app.Post("/completions", auth, openai.CompletionEndpoint(cl, ml, appConfig))
-	app.Post("/v1/engines/:model/completions", auth, openai.CompletionEndpoint(cl, ml, appConfig))
+	app.Post("/v1/completions", openai.CompletionEndpoint(cl, ml, appConfig))
+	app.Post("/completions", openai.CompletionEndpoint(cl, ml, appConfig))
+	app.Post("/v1/engines/:model/completions", openai.CompletionEndpoint(cl, ml, appConfig))

 	// embeddings
-	app.Post("/v1/embeddings", auth, openai.EmbeddingsEndpoint(cl, ml, appConfig))
-	app.Post("/embeddings", auth, openai.EmbeddingsEndpoint(cl, ml, appConfig))
-	app.Post("/v1/engines/:model/embeddings", auth, openai.EmbeddingsEndpoint(cl, ml, appConfig))
+	app.Post("/v1/embeddings", openai.EmbeddingsEndpoint(cl, ml, appConfig))
+	app.Post("/embeddings", openai.EmbeddingsEndpoint(cl, ml, appConfig))
+	app.Post("/v1/engines/:model/embeddings", openai.EmbeddingsEndpoint(cl, ml, appConfig))

 	// audio
-	app.Post("/v1/audio/transcriptions", auth, openai.TranscriptEndpoint(cl, ml, appConfig))
-	app.Post("/v1/audio/speech", auth, localai.TTSEndpoint(cl, ml, appConfig))
+	app.Post("/v1/audio/transcriptions", openai.TranscriptEndpoint(cl, ml, appConfig))
+	app.Post("/v1/audio/speech", localai.TTSEndpoint(cl, ml, appConfig))

 	// images
-	app.Post("/v1/images/generations", auth, openai.ImageEndpoint(cl, ml, appConfig))
+	app.Post("/v1/images/generations", openai.ImageEndpoint(cl, ml, appConfig))

 	if appConfig.ImageDir != "" {
 		app.Static("/generated-images", appConfig.ImageDir)
@@ -81,6 +80,6 @@ func RegisterOpenAIRoutes(app *fiber.App,
 	}

 	// List models
-	app.Get("/v1/models", auth, openai.ListModelsEndpoint(cl, ml))
-	app.Get("/models", auth, openai.ListModelsEndpoint(cl, ml))
+	app.Get("/v1/models", openai.ListModelsEndpoint(cl, ml))
+	app.Get("/models", openai.ListModelsEndpoint(cl, ml))
 }
--- a/core/http/routes/ui.go
+++ b/core/http/routes/ui.go
@@ -59,8 +59,7 @@ func RegisterUIRoutes(app *fiber.App,
 	cl *config.BackendConfigLoader,
 	ml *model.ModelLoader,
 	appConfig *config.ApplicationConfig,
-	galleryService *services.GalleryService,
-	auth func(*fiber.Ctx) error) {
+	galleryService *services.GalleryService) {

 	// keeps the state of models that are being installed from the UI
 	var processingModels = NewModelOpCache()
@@ -85,10 +84,10 @@ func RegisterUIRoutes(app *fiber.App,
 		return processingModelsData, taskTypes
 	}

-	app.Get("/", auth, localai.WelcomeEndpoint(appConfig, cl, ml, modelStatus))
+	app.Get("/", localai.WelcomeEndpoint(appConfig, cl, ml, modelStatus))

 	if p2p.IsP2PEnabled() {
-		app.Get("/p2p", auth, func(c *fiber.Ctx) error {
+		app.Get("/p2p", func(c *fiber.Ctx) error {
 			summary := fiber.Map{
 				"Title":   "LocalAI - P2P dashboard",
 				"Version": internal.PrintableVersion(),
@@ -104,17 +103,17 @@ func RegisterUIRoutes(app *fiber.App,
 		})

 		/* show nodes live! */
-		app.Get("/p2p/ui/workers", auth, func(c *fiber.Ctx) error {
+		app.Get("/p2p/ui/workers", func(c *fiber.Ctx) error {
 			return c.SendString(elements.P2PNodeBoxes(p2p.GetAvailableNodes(p2p.NetworkID(appConfig.P2PNetworkID, p2p.WorkerID))))
 		})
-		app.Get("/p2p/ui/workers-federation", auth, func(c *fiber.Ctx) error {
+		app.Get("/p2p/ui/workers-federation", func(c *fiber.Ctx) error {
 			return c.SendString(elements.P2PNodeBoxes(p2p.GetAvailableNodes(p2p.NetworkID(appConfig.P2PNetworkID, p2p.FederatedID))))
 		})

-		app.Get("/p2p/ui/workers-stats", auth, func(c *fiber.Ctx) error {
+		app.Get("/p2p/ui/workers-stats", func(c *fiber.Ctx) error {
 			return c.SendString(elements.P2PNodeStats(p2p.GetAvailableNodes(p2p.NetworkID(appConfig.P2PNetworkID, p2p.WorkerID))))
 		})
-		app.Get("/p2p/ui/workers-federation-stats", auth, func(c *fiber.Ctx) error {
+		app.Get("/p2p/ui/workers-federation-stats", func(c *fiber.Ctx) error {
 			return c.SendString(elements.P2PNodeStats(p2p.GetAvailableNodes(p2p.NetworkID(appConfig.P2PNetworkID, p2p.FederatedID))))
 		})
 	}
@@ -122,7 +121,7 @@ func RegisterUIRoutes(app *fiber.App,
 	if !appConfig.DisableGalleryEndpoint {

 		// Show the Models page (all models)
-		app.Get("/browse", auth, func(c *fiber.Ctx) error {
+		app.Get("/browse", func(c *fiber.Ctx) error {
 			term := c.Query("term")

 			models, _ := gallery.AvailableGalleryModels(appConfig.Galleries, appConfig.ModelPath)
@@ -167,7 +166,7 @@ func RegisterUIRoutes(app *fiber.App,

 		// Show the models, filtered from the user input
 		// https://htmx.org/examples/active-search/
-		app.Post("/browse/search/models", auth, func(c *fiber.Ctx) error {
+		app.Post("/browse/search/models", func(c *fiber.Ctx) error {
 			form := struct {
 				Search string `form:"search"`
 			}{}
@@ -188,7 +187,7 @@ func RegisterUIRoutes(app *fiber.App,

 		// This route is used when the "Install" button is pressed, we submit here a new job to the gallery service
 		// https://htmx.org/examples/progress-bar/
-		app.Post("/browse/install/model/:id", auth, func(c *fiber.Ctx) error {
+		app.Post("/browse/install/model/:id", func(c *fiber.Ctx) error {
 			galleryID := strings.Clone(c.Params("id")) // note: strings.Clone is required for multiple requests!
 			log.Debug().Msgf("UI job submitted to install  : %+v\n", galleryID)

@@ -215,7 +214,7 @@ func RegisterUIRoutes(app *fiber.App,

 		// This route is used when the "Install" button is pressed, we submit here a new job to the gallery service
 		// https://htmx.org/examples/progress-bar/
-		app.Post("/browse/delete/model/:id", auth, func(c *fiber.Ctx) error {
+		app.Post("/browse/delete/model/:id", func(c *fiber.Ctx) error {
 			galleryID := strings.Clone(c.Params("id")) // note: strings.Clone is required for multiple requests!
 			log.Debug().Msgf("UI job submitted to delete  : %+v\n", galleryID)
 			var galleryName = galleryID
@@ -255,7 +254,7 @@ func RegisterUIRoutes(app *fiber.App,
 		// Display the job current progress status
 		// If the job is done, we trigger the /browse/job/:uid route
 		// https://htmx.org/examples/progress-bar/
-		app.Get("/browse/job/progress/:uid", auth, func(c *fiber.Ctx) error {
+		app.Get("/browse/job/progress/:uid", func(c *fiber.Ctx) error {
 			jobUID := strings.Clone(c.Params("uid")) // note: strings.Clone is required for multiple requests!

 			status := galleryService.GetStatus(jobUID)
@@ -279,7 +278,7 @@ func RegisterUIRoutes(app *fiber.App,

 		// this route is hit when the job is done, and we display the
 		// final state (for now just displays "Installation completed")
-		app.Get("/browse/job/:uid", auth, func(c *fiber.Ctx) error {
+		app.Get("/browse/job/:uid", func(c *fiber.Ctx) error {
 			jobUID := strings.Clone(c.Params("uid")) // note: strings.Clone is required for multiple requests!

 			status := galleryService.GetStatus(jobUID)
@@ -303,8 +302,8 @@ func RegisterUIRoutes(app *fiber.App,
 	}

 	// Show the Chat page
-	app.Get("/chat/:model", auth, func(c *fiber.Ctx) error {
-		backendConfigs, _ := services.ListModels(cl, ml, "", true)
+	app.Get("/chat/:model", func(c *fiber.Ctx) error {
+		backendConfigs, _ := services.ListModels(cl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)

 		summary := fiber.Map{
 			"Title":        "LocalAI - Chat with " + c.Params("model"),
@@ -318,8 +317,8 @@ func RegisterUIRoutes(app *fiber.App,
 		return c.Render("views/chat", summary)
 	})

-	app.Get("/talk/", auth, func(c *fiber.Ctx) error {
-		backendConfigs, _ := services.ListModels(cl, ml, "", true)
+	app.Get("/talk/", func(c *fiber.Ctx) error {
+		backendConfigs, _ := services.ListModels(cl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)

 		if len(backendConfigs) == 0 {
 			// If no model is available redirect to the index which suggests how to install models
@@ -338,9 +337,9 @@ func RegisterUIRoutes(app *fiber.App,
 		return c.Render("views/talk", summary)
 	})

-	app.Get("/chat/", auth, func(c *fiber.Ctx) error {
+	app.Get("/chat/", func(c *fiber.Ctx) error {

-		backendConfigs, _ := services.ListModels(cl, ml, "", true)
+		backendConfigs, _ := services.ListModels(cl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)

 		if len(backendConfigs) == 0 {
 			// If no model is available redirect to the index which suggests how to install models
@@ -359,7 +358,7 @@ func RegisterUIRoutes(app *fiber.App,
 		return c.Render("views/chat", summary)
 	})

-	app.Get("/text2image/:model", auth, func(c *fiber.Ctx) error {
+	app.Get("/text2image/:model", func(c *fiber.Ctx) error {
 		backendConfigs := cl.GetAllBackendConfigs()

 		summary := fiber.Map{
@@ -374,7 +373,7 @@ func RegisterUIRoutes(app *fiber.App,
 		return c.Render("views/text2image", summary)
 	})

-	app.Get("/text2image/", auth, func(c *fiber.Ctx) error {
+	app.Get("/text2image/", func(c *fiber.Ctx) error {

 		backendConfigs := cl.GetAllBackendConfigs()

@@ -395,7 +394,7 @@ func RegisterUIRoutes(app *fiber.App,
 		return c.Render("views/text2image", summary)
 	})

-	app.Get("/tts/:model", auth, func(c *fiber.Ctx) error {
+	app.Get("/tts/:model", func(c *fiber.Ctx) error {
 		backendConfigs := cl.GetAllBackendConfigs()

 		summary := fiber.Map{
@@ -410,7 +409,7 @@ func RegisterUIRoutes(app *fiber.App,
 		return c.Render("views/tts", summary)
 	})

-	app.Get("/tts/", auth, func(c *fiber.Ctx) error {
+	app.Get("/tts/", func(c *fiber.Ctx) error {

 		backendConfigs := cl.GetAllBackendConfigs()

--- a/core/p2p/federated_server.go
+++ b/core/p2p/federated_server.go
@@ -7,6 +7,7 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"io"
 	"net"

 	"github.com/mudler/edgevpn/pkg/node"
@@ -41,7 +42,7 @@ func (fs *FederatedServer) proxy(ctx context.Context, node *node.Node) error {
 		log.Error().Err(err).Msg("Error listening")
 		return err
 	}
-	//	ll.Info("Binding local port on", srcaddr)
+
 	go func() {
 		<-ctx.Done()
 		l.Close()
@@ -82,6 +83,7 @@ func (fs *FederatedServer) proxy(ctx context.Context, node *node.Node) error {

 				if workerID == "" {
 					log.Error().Msg("No available nodes yet")
+					fs.sendHTMLResponse(conn, 503, "Sorry, waiting for nodes to connect")
 					return
 				}

@@ -89,6 +91,7 @@ func (fs *FederatedServer) proxy(ctx context.Context, node *node.Node) error {
 				nodeData, exists := GetNode(fs.service, workerID)
 				if !exists {
 					log.Error().Msgf("Node %s not found", workerID)
+					fs.sendHTMLResponse(conn, 404, "Node not found")
 					return
 				}

@@ -100,3 +103,42 @@ func (fs *FederatedServer) proxy(ctx context.Context, node *node.Node) error {
 		}
 	}
 }
+
+// sendHTMLResponse sends a basic HTML response with a status code and a message.
+// This is extracted to make the HTML content maintainable.
+func (fs *FederatedServer) sendHTMLResponse(conn net.Conn, statusCode int, message string) {
+	defer conn.Close()
+
+	// Define the HTML content separately for easier maintenance.
+	htmlContent := fmt.Sprintf("<html><body><h1>%s</h1></body></html>\r\n", message)
+
+	// Create the HTTP response with dynamic status code and content.
+	response := fmt.Sprintf(
+		"HTTP/1.1 %d %s\r\n"+
+			"Content-Type: text/html\r\n"+
+			"Connection: close\r\n"+
+			"\r\n"+
+			"%s",
+		statusCode, getHTTPStatusText(statusCode), htmlContent,
+	)
+
+	// Write the response to the client connection.
+	_, writeErr := io.WriteString(conn, response)
+	if writeErr != nil {
+		log.Error().Err(writeErr).Msg("Error writing response to client")
+	}
+}
+
+// getHTTPStatusText returns a textual representation of HTTP status codes.
+func getHTTPStatusText(statusCode int) string {
+	switch statusCode {
+	case 503:
+		return "Service Unavailable"
+	case 404:
+		return "Not Found"
+	case 200:
+		return "OK"
+	default:
+		return "Unknown Status"
+	}
+}
--- a/core/schema/localai.go
+++ b/core/schema/localai.go
@@ -2,6 +2,7 @@ package schema

 import (
 	"github.com/mudler/LocalAI/core/p2p"
+	"github.com/mudler/LocalAI/pkg/model"
 	gopsutil "github.com/shirou/gopsutil/v3/process"
 )

@@ -9,6 +10,10 @@ type BackendMonitorRequest struct {
 	Model string `json:"model" yaml:"model"`
 }

+type TokenMetricsRequest struct {
+	Model string `json:"model" yaml:"model"`
+}
+
 type BackendMonitorResponse struct {
 	MemoryInfo    *gopsutil.MemoryInfoStat
 	MemoryPercent float32
@@ -72,5 +77,6 @@ type P2PNodesResponse struct {
 }

 type SystemInformationResponse struct {
-	Backends []string `json:"backends"`
+	Backends []string      `json:"backends"`
+	Models   []model.Model `json:"loaded_models"`
 }
--- a/core/schema/openai.go
+++ b/core/schema/openai.go
@@ -58,6 +58,8 @@ type Content struct {
 	Type     string     `json:"type" yaml:"type"`
 	Text     string     `json:"text" yaml:"text"`
 	ImageURL ContentURL `json:"image_url" yaml:"image_url"`
+	AudioURL ContentURL `json:"audio_url" yaml:"audio_url"`
+	VideoURL ContentURL `json:"video_url" yaml:"video_url"`
 }

 type ContentURL struct {
@@ -76,6 +78,8 @@ type Message struct {

 	StringContent string   `json:"string_content,omitempty" yaml:"string_content,omitempty"`
 	StringImages  []string `json:"string_images,omitempty" yaml:"string_images,omitempty"`
+	StringVideos  []string `json:"string_videos,omitempty" yaml:"string_videos,omitempty"`
+	StringAudios  []string `json:"string_audios,omitempty" yaml:"string_audios,omitempty"`

 	// A result of a function call
 	FunctionCall interface{} `json:"function_call,omitempty" yaml:"function_call,omitempty"`
--- a/core/schema/tokenize.go
+++ b/core/schema/tokenize.go
@@ -0,0 +1,10 @@
+package schema
+
+type TokenizeRequest struct {
+	Content string `json:"content"`
+	Model   string `json:"model"`
+}
+
+type TokenizeResponse struct {
+	Tokens []int32 `json:"tokens"`
+}
--- a/core/services/list_models.go
+++ b/core/services/list_models.go
@@ -1,55 +1,47 @@
 package services

 import (
-	"regexp"
-
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/pkg/model"
 )

-func ListModels(bcl *config.BackendConfigLoader, ml *model.ModelLoader, filter string, excludeConfigured bool) ([]string, error) {
+type LooseFilePolicy int

-	models, err := ml.ListFilesInModelPath()
-	if err != nil {
-		return nil, err
-	}
+const (
+	SKIP_IF_CONFIGURED LooseFilePolicy = iota
+	SKIP_ALWAYS
+	ALWAYS_INCLUDE
+	LOOSE_ONLY
+)

-	var mm map[string]interface{} = map[string]interface{}{}
+func ListModels(bcl *config.BackendConfigLoader, ml *model.ModelLoader, filter config.BackendConfigFilterFn, looseFilePolicy LooseFilePolicy) ([]string, error) {
+
+	var skipMap map[string]interface{} = map[string]interface{}{}

 	dataModels := []string{}

-	var filterFn func(name string) bool
-
-	// If filter is not specified, do not filter the list by model name
-	if filter == "" {
-		filterFn = func(_ string) bool { return true }
-	} else {
-		// If filter _IS_ specified, we compile it to a regex which is used to create the filterFn
-		rxp, err := regexp.Compile(filter)
-		if err != nil {
-			return nil, err
-		}
-		filterFn = func(name string) bool {
-			return rxp.MatchString(name)
-		}
-	}
-
-	// Start with the known configurations
-	for _, c := range bcl.GetAllBackendConfigs() {
-		if excludeConfigured {
-			mm[c.Model] = nil
-		}
-
-		if filterFn(c.Name) {
+	// Start with known configurations
+	if looseFilePolicy != LOOSE_ONLY {
+		for _, c := range bcl.GetBackendConfigsByFilter(filter) {
+			if looseFilePolicy == SKIP_IF_CONFIGURED {
+				skipMap[c.Model] = nil
+			}
 			dataModels = append(dataModels, c.Name)
 		}
 	}

-	// Then iterate through the loose files:
-	for _, m := range models {
-		// And only adds them if they shouldn't be skipped.
-		if _, exists := mm[m]; !exists && filterFn(m) {
-			dataModels = append(dataModels, m)
+	// Then iterate through the loose files if requested.
+	if looseFilePolicy != SKIP_ALWAYS {
+
+		models, err := ml.ListFilesInModelPath()
+		if err != nil {
+			return nil, err
+		}
+		for _, m := range models {
+			// And only adds them if they shouldn't be skipped.
+			if _, exists := skipMap[m]; !exists && filter(m, nil) {
+				dataModels = append(dataModels, m)
+			}
 		}
 	}

--- a/core/startup/startup.go
+++ b/core/startup/startup.go
@@ -1,206 +1,237 @@
-package startup
-
-import (
-	"fmt"
-	"os"
-
-	"github.com/mudler/LocalAI/core"
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/services"
-	"github.com/mudler/LocalAI/internal"
-	"github.com/mudler/LocalAI/pkg/assets"
-	"github.com/mudler/LocalAI/pkg/library"
-	"github.com/mudler/LocalAI/pkg/model"
-	pkgStartup "github.com/mudler/LocalAI/pkg/startup"
-	"github.com/mudler/LocalAI/pkg/xsysinfo"
-	"github.com/rs/zerolog/log"
-)
-
-func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.ModelLoader, *config.ApplicationConfig, error) {
-	options := config.NewApplicationConfig(opts...)
-
-	log.Info().Msgf("Starting LocalAI using %d threads, with models path: %s", options.Threads, options.ModelPath)
-	log.Info().Msgf("LocalAI version: %s", internal.PrintableVersion())
-	caps, err := xsysinfo.CPUCapabilities()
-	if err == nil {
-		log.Debug().Msgf("CPU capabilities: %v", caps)
-	}
-	gpus, err := xsysinfo.GPUs()
-	if err == nil {
-		log.Debug().Msgf("GPU count: %d", len(gpus))
-		for _, gpu := range gpus {
-			log.Debug().Msgf("GPU: %s", gpu.String())
-		}
-	}
-
-	// Make sure directories exists
-	if options.ModelPath == "" {
-		return nil, nil, nil, fmt.Errorf("options.ModelPath cannot be empty")
-	}
-	err = os.MkdirAll(options.ModelPath, 0750)
-	if err != nil {
-		return nil, nil, nil, fmt.Errorf("unable to create ModelPath: %q", err)
-	}
-	if options.ImageDir != "" {
-		err := os.MkdirAll(options.ImageDir, 0750)
-		if err != nil {
-			return nil, nil, nil, fmt.Errorf("unable to create ImageDir: %q", err)
-		}
-	}
-	if options.AudioDir != "" {
-		err := os.MkdirAll(options.AudioDir, 0750)
-		if err != nil {
-			return nil, nil, nil, fmt.Errorf("unable to create AudioDir: %q", err)
-		}
-	}
-	if options.UploadDir != "" {
-		err := os.MkdirAll(options.UploadDir, 0750)
-		if err != nil {
-			return nil, nil, nil, fmt.Errorf("unable to create UploadDir: %q", err)
-		}
-	}
-
-	if err := pkgStartup.InstallModels(options.Galleries, options.ModelLibraryURL, options.ModelPath, options.EnforcePredownloadScans, nil, options.ModelsURL...); err != nil {
-		log.Error().Err(err).Msg("error installing models")
-	}
-
-	cl := config.NewBackendConfigLoader(options.ModelPath)
-	ml := model.NewModelLoader(options.ModelPath)
-
-	configLoaderOpts := options.ToConfigLoaderOptions()
-
-	if err := cl.LoadBackendConfigsFromPath(options.ModelPath, configLoaderOpts...); err != nil {
-		log.Error().Err(err).Msg("error loading config files")
-	}
-
-	if options.ConfigFile != "" {
-		if err := cl.LoadMultipleBackendConfigsSingleFile(options.ConfigFile, configLoaderOpts...); err != nil {
-			log.Error().Err(err).Msg("error loading config file")
-		}
-	}
-
-	if err := cl.Preload(options.ModelPath); err != nil {
-		log.Error().Err(err).Msg("error downloading models")
-	}
-
-	if options.PreloadJSONModels != "" {
-		if err := services.ApplyGalleryFromString(options.ModelPath, options.PreloadJSONModels, options.EnforcePredownloadScans, options.Galleries); err != nil {
-			return nil, nil, nil, err
-		}
-	}
-
-	if options.PreloadModelsFromPath != "" {
-		if err := services.ApplyGalleryFromFile(options.ModelPath, options.PreloadModelsFromPath, options.EnforcePredownloadScans, options.Galleries); err != nil {
-			return nil, nil, nil, err
-		}
-	}
-
-	if options.Debug {
-		for _, v := range cl.GetAllBackendConfigs() {
-			log.Debug().Msgf("Model: %s (config: %+v)", v.Name, v)
-		}
-	}
-
-	if options.AssetsDestination != "" {
-		// Extract files from the embedded FS
-		err := assets.ExtractFiles(options.BackendAssets, options.AssetsDestination)
-		log.Debug().Msgf("Extracting backend assets files to %s", options.AssetsDestination)
-		if err != nil {
-			log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly)", err)
-		}
-	}
-
-	if options.LibPath != "" {
-		// If there is a lib directory, set LD_LIBRARY_PATH to include it
-		err := library.LoadExternal(options.LibPath)
-		if err != nil {
-			log.Error().Err(err).Str("LibPath", options.LibPath).Msg("Error while loading external libraries")
-		}
-	}
-
-	// turn off any process that was started by GRPC if the context is canceled
-	go func() {
-		<-options.Context.Done()
-		log.Debug().Msgf("Context canceled, shutting down")
-		err := ml.StopAllGRPC()
-		if err != nil {
-			log.Error().Err(err).Msg("error while stopping all grpc backends")
-		}
-	}()
-
-	if options.WatchDog {
-		wd := model.NewWatchDog(
-			ml,
-			options.WatchDogBusyTimeout,
-			options.WatchDogIdleTimeout,
-			options.WatchDogBusy,
-			options.WatchDogIdle)
-		ml.SetWatchDog(wd)
-		go wd.Run()
-		go func() {
-			<-options.Context.Done()
-			log.Debug().Msgf("Context canceled, shutting down")
-			wd.Shutdown()
-		}()
-	}
-
-	// Watch the configuration directory
-	startWatcher(options)
-
-	log.Info().Msg("core/startup process completed!")
-	return cl, ml, options, nil
-}
-
-func startWatcher(options *config.ApplicationConfig) {
-	if options.DynamicConfigsDir == "" {
-		// No need to start the watcher if the directory is not set
-		return
-	}
-
-	if _, err := os.Stat(options.DynamicConfigsDir); err != nil {
-		if os.IsNotExist(err) {
-			// We try to create the directory if it does not exist and was specified
-			if err := os.MkdirAll(options.DynamicConfigsDir, 0700); err != nil {
-				log.Error().Err(err).Msg("failed creating DynamicConfigsDir")
-			}
-		} else {
-			// something else happened, we log the error and don't start the watcher
-			log.Error().Err(err).Msg("failed to read DynamicConfigsDir, watcher will not be started")
-			return
-		}
-	}
-
-	configHandler := newConfigFileHandler(options)
-	if err := configHandler.Watch(); err != nil {
-		log.Error().Err(err).Msg("failed creating watcher")
-	}
-}
-
-// In Lieu of a proper DI framework, this function wires up the Application manually.
-// This is in core/startup rather than core/state.go to keep package references clean!
-func createApplication(appConfig *config.ApplicationConfig) *core.Application {
-	app := &core.Application{
-		ApplicationConfig:   appConfig,
-		BackendConfigLoader: config.NewBackendConfigLoader(appConfig.ModelPath),
-		ModelLoader:         model.NewModelLoader(appConfig.ModelPath),
-	}
-
-	var err error
-
-	// app.EmbeddingsBackendService = backend.NewEmbeddingsBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
-	// app.ImageGenerationBackendService = backend.NewImageGenerationBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
-	// app.LLMBackendService = backend.NewLLMBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
-	// app.TranscriptionBackendService = backend.NewTranscriptionBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
-	// app.TextToSpeechBackendService = backend.NewTextToSpeechBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
-
-	app.BackendMonitorService = services.NewBackendMonitorService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
-	app.GalleryService = services.NewGalleryService(app.ApplicationConfig)
-	// app.OpenAIService = services.NewOpenAIService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig, app.LLMBackendService)
-
-	app.LocalAIMetricsService, err = services.NewLocalAIMetricsService()
-	if err != nil {
-		log.Error().Err(err).Msg("encountered an error initializing metrics service, startup will continue but metrics will not be tracked.")
-	}
-
-	return app
-}
+package startup
+
+import (
+	"fmt"
+	"os"
+
+	"github.com/mudler/LocalAI/core"
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/services"
+	"github.com/mudler/LocalAI/internal"
+	"github.com/mudler/LocalAI/pkg/assets"
+	"github.com/mudler/LocalAI/pkg/library"
+	"github.com/mudler/LocalAI/pkg/model"
+	pkgStartup "github.com/mudler/LocalAI/pkg/startup"
+	"github.com/mudler/LocalAI/pkg/xsysinfo"
+	"github.com/rs/zerolog/log"
+)
+
+func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.ModelLoader, *config.ApplicationConfig, error) {
+	options := config.NewApplicationConfig(opts...)
+
+	log.Info().Msgf("Starting LocalAI using %d threads, with models path: %s", options.Threads, options.ModelPath)
+	log.Info().Msgf("LocalAI version: %s", internal.PrintableVersion())
+	caps, err := xsysinfo.CPUCapabilities()
+	if err == nil {
+		log.Debug().Msgf("CPU capabilities: %v", caps)
+	}
+	gpus, err := xsysinfo.GPUs()
+	if err == nil {
+		log.Debug().Msgf("GPU count: %d", len(gpus))
+		for _, gpu := range gpus {
+			log.Debug().Msgf("GPU: %s", gpu.String())
+		}
+	}
+
+	// Make sure directories exists
+	if options.ModelPath == "" {
+		return nil, nil, nil, fmt.Errorf("options.ModelPath cannot be empty")
+	}
+	err = os.MkdirAll(options.ModelPath, 0750)
+	if err != nil {
+		return nil, nil, nil, fmt.Errorf("unable to create ModelPath: %q", err)
+	}
+	if options.ImageDir != "" {
+		err := os.MkdirAll(options.ImageDir, 0750)
+		if err != nil {
+			return nil, nil, nil, fmt.Errorf("unable to create ImageDir: %q", err)
+		}
+	}
+	if options.AudioDir != "" {
+		err := os.MkdirAll(options.AudioDir, 0750)
+		if err != nil {
+			return nil, nil, nil, fmt.Errorf("unable to create AudioDir: %q", err)
+		}
+	}
+	if options.UploadDir != "" {
+		err := os.MkdirAll(options.UploadDir, 0750)
+		if err != nil {
+			return nil, nil, nil, fmt.Errorf("unable to create UploadDir: %q", err)
+		}
+	}
+
+	if err := pkgStartup.InstallModels(options.Galleries, options.ModelLibraryURL, options.ModelPath, options.EnforcePredownloadScans, nil, options.ModelsURL...); err != nil {
+		log.Error().Err(err).Msg("error installing models")
+	}
+
+	cl := config.NewBackendConfigLoader(options.ModelPath)
+	ml := model.NewModelLoader(options.ModelPath)
+
+	configLoaderOpts := options.ToConfigLoaderOptions()
+
+	if err := cl.LoadBackendConfigsFromPath(options.ModelPath, configLoaderOpts...); err != nil {
+		log.Error().Err(err).Msg("error loading config files")
+	}
+
+	if options.ConfigFile != "" {
+		if err := cl.LoadMultipleBackendConfigsSingleFile(options.ConfigFile, configLoaderOpts...); err != nil {
+			log.Error().Err(err).Msg("error loading config file")
+		}
+	}
+
+	if err := cl.Preload(options.ModelPath); err != nil {
+		log.Error().Err(err).Msg("error downloading models")
+	}
+
+	if options.PreloadJSONModels != "" {
+		if err := services.ApplyGalleryFromString(options.ModelPath, options.PreloadJSONModels, options.EnforcePredownloadScans, options.Galleries); err != nil {
+			return nil, nil, nil, err
+		}
+	}
+
+	if options.PreloadModelsFromPath != "" {
+		if err := services.ApplyGalleryFromFile(options.ModelPath, options.PreloadModelsFromPath, options.EnforcePredownloadScans, options.Galleries); err != nil {
+			return nil, nil, nil, err
+		}
+	}
+
+	if options.Debug {
+		for _, v := range cl.GetAllBackendConfigs() {
+			log.Debug().Msgf("Model: %s (config: %+v)", v.Name, v)
+		}
+	}
+
+	if options.AssetsDestination != "" {
+		// Extract files from the embedded FS
+		err := assets.ExtractFiles(options.BackendAssets, options.AssetsDestination)
+		log.Debug().Msgf("Extracting backend assets files to %s", options.AssetsDestination)
+		if err != nil {
+			log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly)", err)
+		}
+	}
+
+	if options.LibPath != "" {
+		// If there is a lib directory, set LD_LIBRARY_PATH to include it
+		err := library.LoadExternal(options.LibPath)
+		if err != nil {
+			log.Error().Err(err).Str("LibPath", options.LibPath).Msg("Error while loading external libraries")
+		}
+	}
+
+	// turn off any process that was started by GRPC if the context is canceled
+	go func() {
+		<-options.Context.Done()
+		log.Debug().Msgf("Context canceled, shutting down")
+		err := ml.StopAllGRPC()
+		if err != nil {
+			log.Error().Err(err).Msg("error while stopping all grpc backends")
+		}
+	}()
+
+	if options.WatchDog {
+		wd := model.NewWatchDog(
+			ml,
+			options.WatchDogBusyTimeout,
+			options.WatchDogIdleTimeout,
+			options.WatchDogBusy,
+			options.WatchDogIdle)
+		ml.SetWatchDog(wd)
+		go wd.Run()
+		go func() {
+			<-options.Context.Done()
+			log.Debug().Msgf("Context canceled, shutting down")
+			wd.Shutdown()
+		}()
+	}
+
+	if options.LoadToMemory != nil {
+		for _, m := range options.LoadToMemory {
+			cfg, err := cl.LoadBackendConfigFileByName(m, options.ModelPath,
+				config.LoadOptionDebug(options.Debug),
+				config.LoadOptionThreads(options.Threads),
+				config.LoadOptionContextSize(options.ContextSize),
+				config.LoadOptionF16(options.F16),
+				config.ModelPath(options.ModelPath),
+			)
+			if err != nil {
+				return nil, nil, nil, err
+			}
+
+			log.Debug().Msgf("Auto loading model %s into memory from file: %s", m, cfg.Model)
+
+			o := backend.ModelOptions(*cfg, options, []model.Option{})
+
+			var backendErr error
+			if cfg.Backend != "" {
+				o = append(o, model.WithBackendString(cfg.Backend))
+				_, backendErr = ml.BackendLoader(o...)
+			} else {
+				_, backendErr = ml.GreedyLoader(o...)
+			}
+			if backendErr != nil {
+				return nil, nil, nil, err
+			}
+		}
+	}
+
+	// Watch the configuration directory
+	startWatcher(options)
+
+	log.Info().Msg("core/startup process completed!")
+	return cl, ml, options, nil
+}
+
+func startWatcher(options *config.ApplicationConfig) {
+	if options.DynamicConfigsDir == "" {
+		// No need to start the watcher if the directory is not set
+		return
+	}
+
+	if _, err := os.Stat(options.DynamicConfigsDir); err != nil {
+		if os.IsNotExist(err) {
+			// We try to create the directory if it does not exist and was specified
+			if err := os.MkdirAll(options.DynamicConfigsDir, 0700); err != nil {
+				log.Error().Err(err).Msg("failed creating DynamicConfigsDir")
+			}
+		} else {
+			// something else happened, we log the error and don't start the watcher
+			log.Error().Err(err).Msg("failed to read DynamicConfigsDir, watcher will not be started")
+			return
+		}
+	}
+
+	configHandler := newConfigFileHandler(options)
+	if err := configHandler.Watch(); err != nil {
+		log.Error().Err(err).Msg("failed creating watcher")
+	}
+}
+
+// In Lieu of a proper DI framework, this function wires up the Application manually.
+// This is in core/startup rather than core/state.go to keep package references clean!
+func createApplication(appConfig *config.ApplicationConfig) *core.Application {
+	app := &core.Application{
+		ApplicationConfig:   appConfig,
+		BackendConfigLoader: config.NewBackendConfigLoader(appConfig.ModelPath),
+		ModelLoader:         model.NewModelLoader(appConfig.ModelPath),
+	}
+
+	var err error
+
+	// app.EmbeddingsBackendService = backend.NewEmbeddingsBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
+	// app.ImageGenerationBackendService = backend.NewImageGenerationBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
+	// app.LLMBackendService = backend.NewLLMBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
+	// app.TranscriptionBackendService = backend.NewTranscriptionBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
+	// app.TextToSpeechBackendService = backend.NewTextToSpeechBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
+
+	app.BackendMonitorService = services.NewBackendMonitorService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
+	app.GalleryService = services.NewGalleryService(app.ApplicationConfig)
+	// app.OpenAIService = services.NewOpenAIService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig, app.LLMBackendService)
+
+	app.LocalAIMetricsService, err = services.NewLocalAIMetricsService()
+	if err != nil {
+		log.Error().Err(err).Msg("encountered an error initializing metrics service, startup will continue but metrics will not be tracked.")
+	}
+
+	return app
+}
--- a/docs/content/docs/getting-started/container-images.md
+++ b/docs/content/docs/getting-started/container-images.md
@@ -154,7 +154,7 @@ Images are available with and without python dependencies. Note that images with

 Images with `core` in the tag are smaller and do not contain any python dependencies. 

-{{< tabs tabTotal="6" >}}
+{{< tabs tabTotal="7" >}}
 {{% tab tabName="Vanilla / CPU Images" %}}

 | Description | Quay | Docker Hub                                   |
@@ -227,6 +227,15 @@ Images with `core` in the tag are smaller and do not contain any python dependen

 {{% /tab %}}

+
+{{% tab tabName="Vulkan Images" %}}
+| Description | Quay | Docker Hub                                                  |
+| --- | --- |-------------------------------------------------------------|
+| Latest images from the branch (development) | `quay.io/go-skynet/local-ai: master-vulkan-ffmpeg-core ` | `localai/localai: master-vulkan-ffmpeg-core `                      |
+| Latest tag | `quay.io/go-skynet/local-ai: latest-vulkan-ffmpeg-core ` | `localai/localai: latest-vulkan-ffmpeg-core`                 |
+| Versioned image including FFMpeg, no python | `quay.io/go-skynet/local-ai:{{< version >}}-vulkan-fmpeg-core` | `localai/localai:{{< version >}}-vulkan-fmpeg-core`             |
+{{% /tab %}}
+
 {{< /tabs >}}

 ## See Also
--- a/Show More
+++ b/Show More