Compare commits

..

98 Commits

Author SHA1 Message Date
dependabot[bot]
33b2d38dd0 chore(deps): Bump chromadb from 0.5.5 to 0.5.7 in /examples/langchain-chroma (#3640)
chore(deps): Bump chromadb in /examples/langchain-chroma

Bumps [chromadb](https://github.com/chroma-core/chroma) from 0.5.5 to 0.5.7.
- [Release notes](https://github.com/chroma-core/chroma/releases)
- [Changelog](https://github.com/chroma-core/chroma/blob/main/RELEASE_PROCESS.md)
- [Commits](https://github.com/chroma-core/chroma/compare/0.5.5...0.5.7)

---
updated-dependencies:
- dependency-name: chromadb
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-25 12:44:32 +02:00
LocalAI [bot]
74408bdc77 chore: ⬆️ Update ggerganov/whisper.cpp to 0d2e2aed80109e8696791083bde3b58e190b7812 (#3658)
⬆️ Update ggerganov/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Dave <dave@gray101.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-09-25 08:54:37 +00:00
dependabot[bot]
8c4f720fb5 chore(deps): Bump llama-index from 0.11.9 to 0.11.12 in /examples/chainlit (#3642)
chore(deps): Bump llama-index in /examples/chainlit

Bumps [llama-index](https://github.com/run-llama/llama_index) from 0.11.9 to 0.11.12.
- [Release notes](https://github.com/run-llama/llama_index/releases)
- [Changelog](https://github.com/run-llama/llama_index/blob/main/CHANGELOG.md)
- [Commits](https://github.com/run-llama/llama_index/compare/v0.11.9...v0.11.12)

---
updated-dependencies:
- dependency-name: llama-index
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-25 08:48:13 +02:00
dependabot[bot]
8002ad27cb chore(deps): Bump openai from 1.45.1 to 1.47.1 in /examples/langchain-chroma (#3641)
chore(deps): Bump openai in /examples/langchain-chroma

Bumps [openai](https://github.com/openai/openai-python) from 1.45.1 to 1.47.1.
- [Release notes](https://github.com/openai/openai-python/releases)
- [Changelog](https://github.com/openai/openai-python/blob/main/CHANGELOG.md)
- [Commits](https://github.com/openai/openai-python/compare/v1.45.1...v1.47.1)

---
updated-dependencies:
- dependency-name: openai
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-25 08:47:57 +02:00
dependabot[bot]
1b8a77433a chore(deps): Bump llama-index from 0.11.7 to 0.11.12 in /examples/langchain-chroma (#3639)
chore(deps): Bump llama-index in /examples/langchain-chroma

Bumps [llama-index](https://github.com/run-llama/llama_index) from 0.11.7 to 0.11.12.
- [Release notes](https://github.com/run-llama/llama_index/releases)
- [Changelog](https://github.com/run-llama/llama_index/blob/main/CHANGELOG.md)
- [Commits](https://github.com/run-llama/llama_index/compare/v0.11.7...v0.11.12)

---
updated-dependencies:
- dependency-name: llama-index
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-25 08:47:33 +02:00
LocalAI [bot]
a370a11115 docs: ⬆️ update docs version mudler/LocalAI (#3657)
⬆️ Update docs version mudler/LocalAI

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-09-25 08:47:03 +02:00
LocalAI [bot]
aa87eff283 chore: ⬆️ Update ggerganov/llama.cpp to 70392f1f81470607ba3afef04aa56c9f65587664 (#3659)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Dave <dave@gray101.com>
2024-09-25 04:51:20 +00:00
dependabot[bot]
0d784f46e5 chore(deps): Bump openai from 1.45.1 to 1.47.1 in /examples/functions (#3645)
Bumps [openai](https://github.com/openai/openai-python) from 1.45.1 to 1.47.1.
- [Release notes](https://github.com/openai/openai-python/releases)
- [Changelog](https://github.com/openai/openai-python/blob/main/CHANGELOG.md)
- [Commits](https://github.com/openai/openai-python/compare/v1.45.1...v1.47.1)

---
updated-dependencies:
- dependency-name: openai
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-25 01:15:53 +00:00
dependabot[bot]
c54cfd3609 chore(deps): Bump pydantic from 2.8.2 to 2.9.2 in /examples/langchain/langchainpy-localai-example (#3648)
chore(deps): Bump pydantic

Bumps [pydantic](https://github.com/pydantic/pydantic) from 2.8.2 to 2.9.2.
- [Release notes](https://github.com/pydantic/pydantic/releases)
- [Changelog](https://github.com/pydantic/pydantic/blob/main/HISTORY.md)
- [Commits](https://github.com/pydantic/pydantic/compare/v2.8.2...v2.9.2)

---
updated-dependencies:
- dependency-name: pydantic
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-24 22:59:11 +00:00
dependabot[bot]
6555994060 chore(deps): Bump sentence-transformers from 3.1.0 to 3.1.1 in /backend/python/sentencetransformers (#3651)
chore(deps): Bump sentence-transformers

Bumps [sentence-transformers](https://github.com/UKPLab/sentence-transformers) from 3.1.0 to 3.1.1.
- [Release notes](https://github.com/UKPLab/sentence-transformers/releases)
- [Commits](https://github.com/UKPLab/sentence-transformers/compare/v3.1.0...v3.1.1)

---
updated-dependencies:
- dependency-name: sentence-transformers
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-24 21:22:08 +00:00
Ettore Di Giacinto
0893d3cbbe fix(health): do not require auth for /healthz and /readyz (#3656)
* fix(health): do not require auth for /healthz and /readyz

Fixes: #3655

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Comment so I don’t forget

Adding a reminder here...

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Dave <dave@gray101.com>
2024-09-24 18:25:59 +00:00
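
The fix above can be sanity-checked with an unauthenticated probe; a minimal sketch, assuming a LocalAI instance on the default localhost:8080 (address and port are assumptions):

import requests

# Deliberately send no Authorization header: after this fix, /healthz and
# /readyz should answer 200 even when API keys are enforced for the rest of the API.
for path in ("/healthz", "/readyz"):
    resp = requests.get(f"http://localhost:8080{path}", timeout=5)
    print(path, resp.status_code)
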
Dave
90cacb9692 test: preliminary tests and merge fix for authv2 (#3584)
* add api key to existing app tests, add preliminary auth test

Signed-off-by: Dave Lee <dave@gray101.com>

* small fix, run test

Signed-off-by: Dave Lee <dave@gray101.com>

* status on non-opaque

Signed-off-by: Dave Lee <dave@gray101.com>

* tweak auth error

Signed-off-by: Dave Lee <dave@gray101.com>

* exp

Signed-off-by: Dave Lee <dave@gray101.com>

* quick fix on real laptop

Signed-off-by: Dave Lee <dave@gray101.com>

* add downloader version that allows providing an auth header (see the sketch after this entry)

Signed-off-by: Dave Lee <dave@gray101.com>

* stash some devcontainer fixes during testing

Signed-off-by: Dave Lee <dave@gray101.com>

* s2

Signed-off-by: Dave Lee <dave@gray101.com>

* s

Signed-off-by: Dave Lee <dave@gray101.com>

* done with experiment

Signed-off-by: Dave Lee <dave@gray101.com>

* done with experiment

Signed-off-by: Dave Lee <dave@gray101.com>

* after merge fix

Signed-off-by: Dave Lee <dave@gray101.com>

* rename and fix

Signed-off-by: Dave Lee <dave@gray101.com>

---------

Signed-off-by: Dave Lee <dave@gray101.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-09-24 09:32:48 +02:00
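
The auth-header download added above amounts to attaching an Authorization header to the fetch. A minimal sketch of the idea in Python (function name, URL, and header value are illustrative; the real downloader is part of LocalAI's Go code):

import requests

def download_with_auth(url: str, dest: str, auth_header: str = "") -> None:
    # e.g. auth_header="Bearer <hf_token>" for gated Hugging Face files; empty for public URLs.
    headers = {"Authorization": auth_header} if auth_header else {}
    with requests.get(url, headers=headers, stream=True, timeout=60) as resp:
        resp.raise_for_status()
        with open(dest, "wb") as f:
            for chunk in resp.iter_content(chunk_size=1 << 16):
                f.write(chunk)
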
LocalAI [bot]
69d2902b0a chore: ⬆️ Update ggerganov/llama.cpp to f0c7b5edf82aa200656fd88c11ae3a805d7130bf (#3653)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-09-24 09:31:28 +02:00
dependabot[bot]
c1752cbb83 chore(deps): Bump sqlalchemy from 2.0.32 to 2.0.35 in /examples/langchain/langchainpy-localai-example (#3649)
chore(deps): Bump sqlalchemy

Bumps [sqlalchemy](https://github.com/sqlalchemy/sqlalchemy) from 2.0.32 to 2.0.35.
- [Release notes](https://github.com/sqlalchemy/sqlalchemy/releases)
- [Changelog](https://github.com/sqlalchemy/sqlalchemy/blob/main/CHANGES.rst)
- [Commits](https://github.com/sqlalchemy/sqlalchemy/commits)

---
updated-dependencies:
- dependency-name: sqlalchemy
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-24 04:30:05 +00:00
dependabot[bot]
b8e129f2a6 chore(deps): Bump idna from 3.8 to 3.10 in /examples/langchain/langchainpy-localai-example (#3644)
chore(deps): Bump idna

Bumps [idna](https://github.com/kjd/idna) from 3.8 to 3.10.
- [Release notes](https://github.com/kjd/idna/releases)
- [Changelog](https://github.com/kjd/idna/blob/master/HISTORY.rst)
- [Commits](https://github.com/kjd/idna/compare/v3.8...v3.10)

---
updated-dependencies:
- dependency-name: idna
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-24 02:53:35 +00:00
dependabot[bot]
cc6fac1688 chore(deps): Bump urllib3 from 2.2.2 to 2.2.3 in /examples/langchain/langchainpy-localai-example (#3646)
chore(deps): Bump urllib3

Bumps [urllib3](https://github.com/urllib3/urllib3) from 2.2.2 to 2.2.3.
- [Release notes](https://github.com/urllib3/urllib3/releases)
- [Changelog](https://github.com/urllib3/urllib3/blob/main/CHANGES.rst)
- [Commits](https://github.com/urllib3/urllib3/compare/2.2.2...2.2.3)

---
updated-dependencies:
- dependency-name: urllib3
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-24 01:16:39 +00:00
dependabot[bot]
043cb94436 chore(deps): Bump yarl from 1.11.0 to 1.11.1 in /examples/langchain/langchainpy-localai-example (#3643)
chore(deps): Bump yarl

Bumps [yarl](https://github.com/aio-libs/yarl) from 1.11.0 to 1.11.1.
- [Release notes](https://github.com/aio-libs/yarl/releases)
- [Changelog](https://github.com/aio-libs/yarl/blob/master/CHANGES.rst)
- [Commits](https://github.com/aio-libs/yarl/compare/v1.11.0...v1.11.1)

---
updated-dependencies:
- dependency-name: yarl
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-23 21:23:21 +00:00
Ettore Di Giacinto
bbdf78615e models(gallery): add acolyte-22b-i1 (#3636)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-23 19:24:14 +02:00
Ettore Di Giacinto
e332ff8066 models(gallery): add gemma-2-2b-arliai-rpmax-v1.1 (#3635)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-23 19:16:41 +02:00
Ettore Di Giacinto
26d99ed1c7 models(gallery): add gemma-2-9b-arliai-rpmax-v1.1 (#3634)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-23 19:12:54 +02:00
Ettore Di Giacinto
1da8d8b9db models(gallery): add nightygurps-14b-v1.1 (#3633)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-23 19:09:51 +02:00
Ettore Di Giacinto
bf8f8671d1 chore(ci): adjust parallelism
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-09-23 19:04:36 +02:00
Ettore Di Giacinto
51cba89682 fix(hipblas): do not push all variants to hipblas builds (#3630)
As with CUDA builds, we don't need all the CPU variants when we are
compiling against the accelerated backends; this saves space and avoids
exceeding Go's embed.FS size limits.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-23 11:49:07 +02:00
Ettore Di Giacinto
3e8e71f8b6 fix(ci): fixup checksum scanning pipeline (#3631)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-23 10:56:10 +02:00
LocalAI [bot]
4edd8c80b4 chore: ⬆️ Update ggerganov/llama.cpp to c35e586ea57221844442c65a1172498c54971cb0 (#3629)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-09-22 21:41:34 +00:00
Ettore Di Giacinto
fd70a22196 chore(ci): adjust parallel jobs
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-09-22 15:21:16 +02:00
Ettore Di Giacinto
56f4deb938 chore(ci): split hipblas jobs
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-09-22 15:19:38 +02:00
Ettore Di Giacinto
9bd7f3f995 feat(coqui): switch to maintained community fork (#3625)
Fixes: https://github.com/mudler/LocalAI/issues/2513

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-22 10:04:20 +02:00
Sertaç Özercan
ee21b00a8d feat: auto load into memory on startup (#3627)
Signed-off-by: Sertac Ozercan <sozercan@gmail.com>
2024-09-22 10:03:30 +02:00
LocalAI [bot]
1f43678d53 chore: ⬆️ Update ggerganov/llama.cpp to d09770cae71b416c032ec143dda530f7413c4038 (#3626)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-09-21 22:03:23 +00:00
Ettore Di Giacinto
20c0e128c0 fix(sycl): downgrade pypinyin
melotts requires pypinyin 0.50

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-09-21 21:52:12 +02:00
Ettore Di Giacinto
5c3d1d81e6 fix(parler-tts): fix install with sycl (#3624)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-21 16:04:04 +02:00
lnyxaris
c22b3187a7 Fix NeuralDaredevil URL (#3621)
Signed-off-by: lnyxaris <nyxaris@anomalous.news>
2024-09-21 10:10:27 +02:00
LocalAI [bot]
54f2657870 chore: ⬆️ Update ggerganov/llama.cpp to 63351143b2ea5efe9f8b9c61f553af8a51f1deff (#3622)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-09-21 10:09:41 +02:00
LocalAI [bot]
cef7f8a014 chore: ⬆️ Update ggerganov/whisper.cpp to 34972dbe221709323714fc8402f2e24041d48213 (#3623)
⬆️ Update ggerganov/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-09-20 21:41:13 +00:00
Ettore Di Giacinto
bf8e50a11d chore(docs): add Vulkan images links (#3620)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-20 18:16:01 +02:00
Ettore Di Giacinto
6c6cd8bbe0 models(gallery): add llama-3.1-8b-arliai-rpmax-v1.1 (#3619)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-20 18:15:51 +02:00
Ettore Di Giacinto
00d6c2a966 models(gallery): add llama3.1-reflective config
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-20 17:35:06 +02:00
Ettore Di Giacinto
415cf31aa3 models(gallery): add buddy2 (#3618)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-20 17:33:29 +02:00
Ettore Di Giacinto
f55053bfba models(gallery): add llama3.1-8b-shiningvaliant2 (#3617)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-20 17:26:59 +02:00
Ettore Di Giacinto
e24654ada0 models(gallery): add llama-3.1-supernova-lite (#3616)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-20 17:23:30 +02:00
Ettore Di Giacinto
c4cecba07f models(gallery): add llama-3.1-supernova-lite-reflection-v1.0-i1 (#3615)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-20 17:19:53 +02:00
Ettore Di Giacinto
38cad0b8dc models(gallery): add qwen2.5 32B, 72B, 32B Instruct (#3614)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-20 17:10:43 +02:00
Ettore Di Giacinto
052af98dcd models(gallery): add qwen2.5-0.5b-instruct, qwen2.5-1.5b-instruct (#3613)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-20 15:45:23 +02:00
Ettore Di Giacinto
56d8f5163c models(gallery): add qwen2.5-math-72b-instruct (#3612)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-20 15:12:35 +02:00
Ettore Di Giacinto
b6af4f4467 models(gallery): add qwen2.5-coder-7b-instruct (#3611)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-20 15:08:57 +02:00
Ettore Di Giacinto
a5b08f43ff models(gallery): add qwen2.5-14b_uncencored (#3610)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-20 11:22:53 +02:00
Ettore Di Giacinto
c15f506fd5 models(gallery): add qwen2.5-math-7b-instruct (#3609)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-20 11:18:49 +02:00
Ettore Di Giacinto
a2a63460e9 models(gallery): add qwen2.5-14b-instruct (#3607)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-20 10:59:29 +02:00
LocalAI [bot]
2fcea486eb chore: ⬆️ Update ggerganov/llama.cpp to 6026da52d6942b253df835070619775d849d0258 (#3605)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-09-20 10:50:14 +02:00
LocalAI [bot]
5c9d26e39b feat(swagger): update swagger (#3604)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-09-20 10:49:32 +02:00
Ettore Di Giacinto
191bc2e50a feat(api): allow to pass audios to backends (#3603)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-19 12:26:53 +02:00
Ettore Di Giacinto
fbb9facda4 feat(api): allow to pass videos to backends (#3601)
This prepares the API to also receive videos, for video understanding.

It works similarly to images: the request should contain an entry of the form:

{
 "type": "video_url",
 "video_url": { "url": "url or base64 data" }
}
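
For illustration, a complete chat request carrying such an entry could look like the sketch below, assuming the OpenAI-compatible /v1/chat/completions endpoint on localhost:8080 and a video-capable model (endpoint, port, and model name are assumptions):

import requests

payload = {
    "model": "gpt-4o",  # model name is an assumption
    "messages": [{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe what happens in this clip"},
            {"type": "video_url", "video_url": {"url": "https://example.com/clip.mp4"}},
        ],
    }],
}
resp = requests.post("http://localhost:8080/v1/chat/completions", json=payload)
print(resp.json())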

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-19 11:21:59 +02:00
LocalAI [bot]
c6a819e92f chore: ⬆️ Update ggerganov/llama.cpp to 64c6af3195c3cd4aa3328a1282d29cd2635c34c9 (#3598)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-09-18 21:41:59 +00:00
Ettore Di Giacinto
a50cde69a2 chore(aio): rename gpt-4-vision-preview to gpt-4o (#3597)
Fixes: #3596

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-18 15:55:46 +02:00
LocalAI [bot]
e5bd74878e chore: ⬆️ Update ggerganov/whisper.cpp to 5b1ce40fa882e9cb8630b48032067a1ed2f1534f (#3592)
⬆️ Update ggerganov/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-09-18 00:02:02 +02:00
LocalAI [bot]
dc98b2ea44 chore: ⬆️ Update ggerganov/llama.cpp to 8b836ae731bbb2c5640bc47df5b0a78ffcb129cb (#3591)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-09-17 21:51:41 +00:00
Ettore Di Giacinto
acf119828f Revert "chore(deps): Bump securego/gosec from 2.21.0 to 2.21.2" (#3590)
Revert "chore(deps): Bump securego/gosec from 2.21.0 to 2.21.2 (#3561)"

This reverts commit 12a8d0e46f.
2024-09-17 17:22:56 +02:00
Ettore Di Giacinto
a53392f919 chore(refactor): drop duplicated shutdown logics (#3589)
* chore(refactor): drop duplicated shutdown logics

- Handle locking in Shutdown and CheckModelIsLoaded in a more go-idiomatic way
- Drop duplicated code and re-organize shutdown code

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix: drop leftover

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore: improve logging and add missing locks

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-17 16:51:40 +02:00
dependabot[bot]
eee1fb2c75 chore(deps): Bump pypinyin from 0.50.0 to 0.53.0 in /backend/python/openvoice (#3562)
chore(deps): Bump pypinyin in /backend/python/openvoice

Bumps [pypinyin](https://github.com/mozillazg/python-pinyin) from 0.50.0 to 0.53.0.
- [Release notes](https://github.com/mozillazg/python-pinyin/releases)
- [Changelog](https://github.com/mozillazg/python-pinyin/blob/master/CHANGELOG.rst)
- [Commits](https://github.com/mozillazg/python-pinyin/compare/v0.50.0...v0.53.0)

---
updated-dependencies:
- dependency-name: pypinyin
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-17 10:24:34 +02:00
dependabot[bot]
8826ca93b3 chore(deps): Bump openai from 1.44.0 to 1.45.1 in /examples/langchain/langchainpy-localai-example (#3573)
chore(deps): Bump openai

Bumps [openai](https://github.com/openai/openai-python) from 1.44.0 to 1.45.1.
- [Release notes](https://github.com/openai/openai-python/releases)
- [Changelog](https://github.com/openai/openai-python/blob/main/CHANGELOG.md)
- [Commits](https://github.com/openai/openai-python/compare/v1.44.0...v1.45.1)

---
updated-dependencies:
- dependency-name: openai
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-17 10:24:14 +02:00
dependabot[bot]
5049629381 chore(deps): Bump langchain from 0.2.16 to 0.3.0 in /examples/langchain/langchainpy-localai-example (#3577)
chore(deps): Bump langchain

Bumps [langchain](https://github.com/langchain-ai/langchain) from 0.2.16 to 0.3.0.
- [Release notes](https://github.com/langchain-ai/langchain/releases)
- [Commits](https://github.com/langchain-ai/langchain/compare/langchain==0.2.16...langchain==0.3.0)

---
updated-dependencies:
- dependency-name: langchain
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-17 10:24:01 +02:00
Dave
92136a5d34 fix: gallery/index.yaml comment spacing (#3585)
extremely minor fix: add a space to index.yaml for the scanner

Signed-off-by: Dave Lee <dave@gray101.com>
2024-09-17 09:23:58 +02:00
Ettore Di Giacinto
075e5015c0 Revert "chore(deps): Bump setuptools from 69.5.1 to 75.1.0 in /backend/python/transformers" (#3586)
Revert "chore(deps): Bump setuptools from 69.5.1 to 75.1.0 in /backend/python…"

This reverts commit e95cb8eaac.
2024-09-17 09:06:07 +02:00
dependabot[bot]
46fd4ff6db chore(deps): Bump openai from 1.44.0 to 1.45.1 in /examples/functions (#3560)
Bumps [openai](https://github.com/openai/openai-python) from 1.44.0 to 1.45.1.
- [Release notes](https://github.com/openai/openai-python/releases)
- [Changelog](https://github.com/openai/openai-python/blob/main/CHANGELOG.md)
- [Commits](https://github.com/openai/openai-python/compare/v1.44.0...v1.45.1)

---
updated-dependencies:
- dependency-name: openai
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-17 06:19:52 +00:00
Alexander Izotov
4a4e44bf55 feat: allow setting trust_remote_code for sentencetransformers backend (#3552)
Allow setting trust_remote_code for SentenceTransformers backend

Signed-off-by: Nyralei <93216976+Nyralei@users.noreply.github.com>
2024-09-17 05:52:37 +00:00
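
Under the hood this maps to the trust_remote_code flag of the sentence-transformers constructor. A minimal sketch of the underlying library call (the model name is an assumption; only enable the flag for repositories you trust):

from sentence_transformers import SentenceTransformer

# trust_remote_code=True lets the model repository ship its own modeling code,
# which some community embedding models require in order to load.
model = SentenceTransformer("jinaai/jina-embeddings-v2-base-en", trust_remote_code=True)
embeddings = model.encode(["hello world"])
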
dependabot[bot]
22247ad92c chore(deps): Bump langchain from 0.2.16 to 0.3.0 in /examples/langchain-chroma (#3557)
chore(deps): Bump langchain in /examples/langchain-chroma

Bumps [langchain](https://github.com/langchain-ai/langchain) from 0.2.16 to 0.3.0.
- [Release notes](https://github.com/langchain-ai/langchain/releases)
- [Commits](https://github.com/langchain-ai/langchain/compare/langchain==0.2.16...langchain==0.3.0)

---
updated-dependencies:
- dependency-name: langchain
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-17 05:50:31 +00:00
Ettore Di Giacinto
d0f2bf3181 fix(shutdown): do not shutdown immediately busy backends (#3543)
* fix(shutdown): do not shutdown immediately busy backends

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore(refactor): avoid duplicate functions

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix: multiplicative backoff for shutdown (#3547)

* multiplicative backoff for shutdown

Rather than always retrying every two seconds, back off the shutdown attempt rate (see the sketch after this entry).

Signed-off-by: Dave <dave@gray101.com>

* Update loader.go

Signed-off-by: Dave <dave@gray101.com>

* add clamp of 2 minutes

Signed-off-by: Dave Lee <dave@gray101.com>

---------

Signed-off-by: Dave <dave@gray101.com>
Signed-off-by: Dave Lee <dave@gray101.com>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Signed-off-by: Dave <dave@gray101.com>
Signed-off-by: Dave Lee <dave@gray101.com>
Co-authored-by: Dave <dave@gray101.com>
2024-09-17 04:50:57 +00:00
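
A sketch of the multiplicative backoff described above, in Python for illustration (the base interval and growth factor are assumptions; only the 2-minute clamp comes from the commit, and the real implementation lives in the Go model loader):

import time

def wait_until_idle(is_busy, base: float = 2.0, factor: float = 2.0, clamp: float = 120.0) -> None:
    """Retry while the backend reports busy, growing the wait multiplicatively,
    capped at the 2-minute clamp."""
    interval = base
    while is_busy():
        time.sleep(interval)
        interval = min(interval * factor, clamp)
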
LocalAI [bot]
0e4e101101 chore: ⬆️ Update ggerganov/llama.cpp to 23e0d70bacaaca1429d365a44aa9e7434f17823b (#3581)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-09-17 03:52:15 +00:00
dependabot[bot]
f4b1bd8f6d chore(deps): Bump setuptools from 70.3.0 to 75.1.0 in /backend/python/vllm (#3580)
chore(deps): Bump setuptools in /backend/python/vllm

Bumps [setuptools](https://github.com/pypa/setuptools) from 70.3.0 to 75.1.0.
- [Release notes](https://github.com/pypa/setuptools/releases)
- [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst)
- [Commits](https://github.com/pypa/setuptools/compare/v70.3.0...v75.1.0)

---
updated-dependencies:
- dependency-name: setuptools
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-17 03:41:01 +00:00
dependabot[bot]
e95cb8eaac chore(deps): Bump setuptools from 69.5.1 to 75.1.0 in /backend/python/transformers (#3579)
chore(deps): Bump setuptools in /backend/python/transformers

Bumps [setuptools](https://github.com/pypa/setuptools) from 69.5.1 to 75.1.0.
- [Release notes](https://github.com/pypa/setuptools/releases)
- [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst)
- [Commits](https://github.com/pypa/setuptools/compare/v69.5.1...v75.1.0)

---
updated-dependencies:
- dependency-name: setuptools
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-17 03:33:52 +00:00
Dave
db1159b651 feat: auth v2 - supersedes #2894 (#3476)
feat: auth v2 - supersedes #2894, metrics to follow later

Signed-off-by: Dave Lee <dave@gray101.com>
2024-09-16 23:29:07 -04:00
dependabot[bot]
a9a3a07c3b chore(deps): Bump setuptools from 72.1.0 to 75.1.0 in /backend/python/rerankers (#3578)
chore(deps): Bump setuptools in /backend/python/rerankers

Bumps [setuptools](https://github.com/pypa/setuptools) from 72.1.0 to 75.1.0.
- [Release notes](https://github.com/pypa/setuptools/releases)
- [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst)
- [Commits](https://github.com/pypa/setuptools/compare/v72.1.0...v75.1.0)

---
updated-dependencies:
- dependency-name: setuptools
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-17 03:24:30 +00:00
dependabot[bot]
06c8339862 chore(deps): Bump setuptools from 70.3.0 to 75.1.0 in /backend/python/bark (#3574)
chore(deps): Bump setuptools in /backend/python/bark

Bumps [setuptools](https://github.com/pypa/setuptools) from 70.3.0 to 75.1.0.
- [Release notes](https://github.com/pypa/setuptools/releases)
- [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst)
- [Commits](https://github.com/pypa/setuptools/compare/v70.3.0...v75.1.0)

---
updated-dependencies:
- dependency-name: setuptools
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-17 02:32:33 +00:00
dependabot[bot]
2394f7833f chore(deps): Bump setuptools from 70.3.0 to 75.1.0 in /backend/python/diffusers (#3575)
chore(deps): Bump setuptools in /backend/python/diffusers

Bumps [setuptools](https://github.com/pypa/setuptools) from 70.3.0 to 75.1.0.
- [Release notes](https://github.com/pypa/setuptools/releases)
- [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst)
- [Commits](https://github.com/pypa/setuptools/compare/v70.3.0...v75.1.0)

---
updated-dependencies:
- dependency-name: setuptools
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-17 02:28:05 +00:00
dependabot[bot]
36e19928eb chore(deps): Bump greenlet from 3.0.3 to 3.1.0 in /examples/langchain/langchainpy-localai-example (#3571)
chore(deps): Bump greenlet

Bumps [greenlet](https://github.com/python-greenlet/greenlet) from 3.0.3 to 3.1.0.
- [Changelog](https://github.com/python-greenlet/greenlet/blob/master/CHANGES.rst)
- [Commits](https://github.com/python-greenlet/greenlet/compare/3.0.3...3.1.0)

---
updated-dependencies:
- dependency-name: greenlet
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-17 01:14:39 +00:00
dependabot[bot]
abc27e0dc4 chore(deps): Bump setuptools from 72.1.0 to 75.1.0 in /backend/python/vall-e-x (#3570)
chore(deps): Bump setuptools in /backend/python/vall-e-x

Bumps [setuptools](https://github.com/pypa/setuptools) from 72.1.0 to 75.1.0.
- [Release notes](https://github.com/pypa/setuptools/releases)
- [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst)
- [Commits](https://github.com/pypa/setuptools/compare/v72.1.0...v75.1.0)

---
updated-dependencies:
- dependency-name: setuptools
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-17 00:51:55 +00:00
dependabot[bot]
42d6b9e0cc chore(deps): Bump weaviate-client from 4.6.7 to 4.8.1 in /examples/chainlit (#3568)
chore(deps): Bump weaviate-client in /examples/chainlit

Bumps [weaviate-client](https://github.com/weaviate/weaviate-python-client) from 4.6.7 to 4.8.1.
- [Release notes](https://github.com/weaviate/weaviate-python-client/releases)
- [Changelog](https://github.com/weaviate/weaviate-python-client/blob/main/docs/changelog.rst)
- [Commits](https://github.com/weaviate/weaviate-python-client/compare/v4.6.7...v4.8.1)

---
updated-dependencies:
- dependency-name: weaviate-client
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-17 00:11:15 +00:00
dependabot[bot]
c866b77586 chore(deps): Bump llama-index from 0.11.7 to 0.11.9 in /examples/chainlit (#3567)
chore(deps): Bump llama-index in /examples/chainlit

Bumps [llama-index](https://github.com/run-llama/llama_index) from 0.11.7 to 0.11.9.
- [Release notes](https://github.com/run-llama/llama_index/releases)
- [Changelog](https://github.com/run-llama/llama_index/blob/main/CHANGELOG.md)
- [Commits](https://github.com/run-llama/llama_index/compare/v0.11.7...v0.11.9)

---
updated-dependencies:
- dependency-name: llama-index
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-17 00:02:42 +00:00
dependabot[bot]
5356b81b7f chore(deps): Bump sentence-transformers from 3.0.1 to 3.1.0 in /backend/python/sentencetransformers (#3566)
chore(deps): Bump sentence-transformers

Bumps [sentence-transformers](https://github.com/UKPLab/sentence-transformers) from 3.0.1 to 3.1.0.
- [Release notes](https://github.com/UKPLab/sentence-transformers/releases)
- [Commits](https://github.com/UKPLab/sentence-transformers/compare/v3.0.1...v3.1.0)

---
updated-dependencies:
- dependency-name: sentence-transformers
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-16 23:40:39 +00:00
dependabot[bot]
30fe163100 chore(deps): Bump setuptools from 72.1.0 to 75.1.0 in /backend/python/parler-tts (#3565)
chore(deps): Bump setuptools in /backend/python/parler-tts

Bumps [setuptools](https://github.com/pypa/setuptools) from 72.1.0 to 75.1.0.
- [Release notes](https://github.com/pypa/setuptools/releases)
- [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst)
- [Commits](https://github.com/pypa/setuptools/compare/v72.1.0...v75.1.0)

---
updated-dependencies:
- dependency-name: setuptools
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-16 23:13:09 +00:00
dependabot[bot]
afb5bbc1b8 chore(deps): Bump setuptools from 69.5.1 to 75.1.0 in /backend/python/transformers-musicgen (#3564)
chore(deps): Bump setuptools in /backend/python/transformers-musicgen

Bumps [setuptools](https://github.com/pypa/setuptools) from 69.5.1 to 75.1.0.
- [Release notes](https://github.com/pypa/setuptools/releases)
- [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst)
- [Commits](https://github.com/pypa/setuptools/compare/v69.5.1...v75.1.0)

---
updated-dependencies:
- dependency-name: setuptools
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-16 23:03:06 +00:00
dependabot[bot]
12a8d0e46f chore(deps): Bump securego/gosec from 2.21.0 to 2.21.2 (#3561)
Bumps [securego/gosec](https://github.com/securego/gosec) from 2.21.0 to 2.21.2.
- [Release notes](https://github.com/securego/gosec/releases)
- [Changelog](https://github.com/securego/gosec/blob/master/.goreleaser.yml)
- [Commits](https://github.com/securego/gosec/compare/v2.21.0...v2.21.2)

---
updated-dependencies:
- dependency-name: securego/gosec
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-16 21:57:16 +00:00
dependabot[bot]
09c7d8d458 chore(deps): Bump setuptools from 72.1.0 to 75.1.0 in /backend/python/autogptq (#3553)
chore(deps): Bump setuptools in /backend/python/autogptq

Bumps [setuptools](https://github.com/pypa/setuptools) from 72.1.0 to 75.1.0.
- [Release notes](https://github.com/pypa/setuptools/releases)
- [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst)
- [Commits](https://github.com/pypa/setuptools/compare/v72.1.0...v75.1.0)

---
updated-dependencies:
- dependency-name: setuptools
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-16 21:46:26 +00:00
dependabot[bot]
149cc1eb13 chore(deps): Bump openai from 1.44.1 to 1.45.1 in /examples/langchain-chroma (#3556)
chore(deps): Bump openai in /examples/langchain-chroma

Bumps [openai](https://github.com/openai/openai-python) from 1.44.1 to 1.45.1.
- [Release notes](https://github.com/openai/openai-python/releases)
- [Changelog](https://github.com/openai/openai-python/blob/main/CHANGELOG.md)
- [Commits](https://github.com/openai/openai-python/compare/v1.44.1...v1.45.1)

---
updated-dependencies:
- dependency-name: openai
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-16 21:44:34 +00:00
dependabot[bot]
a5ce987bdb chore(deps): Bump langchain from 0.2.16 to 0.3.0 in /examples/functions (#3559)
Bumps [langchain](https://github.com/langchain-ai/langchain) from 0.2.16 to 0.3.0.
- [Release notes](https://github.com/langchain-ai/langchain/releases)
- [Commits](https://github.com/langchain-ai/langchain/compare/langchain==0.2.16...langchain==0.3.0)

---
updated-dependencies:
- dependency-name: langchain
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-16 21:35:10 +00:00
dependabot[bot]
2edc732c33 chore(deps): Bump setuptools from 72.1.0 to 75.1.0 in /backend/python/coqui (#3554)
chore(deps): Bump setuptools in /backend/python/coqui

Bumps [setuptools](https://github.com/pypa/setuptools) from 72.1.0 to 75.1.0.
- [Release notes](https://github.com/pypa/setuptools/releases)
- [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst)
- [Commits](https://github.com/pypa/setuptools/compare/v72.1.0...v75.1.0)

---
updated-dependencies:
- dependency-name: setuptools
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-16 21:23:06 +00:00
dependabot[bot]
fec01d9e69 chore(deps): Bump docs/themes/hugo-theme-relearn from f696f60 to d5a0ee0 (#3558)
chore(deps): Bump docs/themes/hugo-theme-relearn

Bumps [docs/themes/hugo-theme-relearn](https://github.com/McShelby/hugo-theme-relearn) from `f696f60` to `d5a0ee0`.
- [Release notes](https://github.com/McShelby/hugo-theme-relearn/releases)
- [Commits](f696f60f4e...d5a0ee04ad)

---
updated-dependencies:
- dependency-name: docs/themes/hugo-theme-relearn
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-09-16 21:00:35 +00:00
Ettore Di Giacinto
9ca5ef339a models(gallery): add llama-3.1-8b-stheno-v3.4-iq-imatrix (#3551)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-16 16:44:00 +02:00
Ettore Di Giacinto
a8003f2b7c models(gallery): add l3.1-8b-niitama-v1.1-iq-imatrix (#3550)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-16 13:06:07 +02:00
Ettore Di Giacinto
25deb4ba95 chore(deps): update llama.cpp to 6262d13e0b2da91f230129a93a996609a2f5a2f2 (#3549)
chore(deps): update llama.cpp to 6262d13e0b2da91f230129a93a996609a2f5a2f2

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-16 10:29:20 +02:00
LocalAI [bot]
3d3db1d74f chore: ⬆️ Update ggerganov/whisper.cpp to 049b3a0e53c8a8e4c4576c06a1a4fccf0063a73f (#3548)
⬆️ Update ggerganov/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-09-15 21:40:51 +00:00
LocalAI [bot]
cabb1602e8 chore: ⬆️ Update ggerganov/llama.cpp to feff4aa8461da7c432d144c11da4802e41fef3cf (#3542)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-09-13 22:05:38 +00:00
Ettore Di Giacinto
25e7661de2 chore(exllama): drop exllama backend (#3536)
* chore(exllama): drop exllama backend

For polishing and cleaning up, it now makes sense to drop exllama, which
is completely unmaintained and only supported the llamav1
architecture (nowadays superseded by llamav2).

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore(gosec): fix CI

downgrade to latest known version of the gosec action

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-13 19:52:13 +00:00
Ettore Di Giacinto
cbfab81c35 models(gallery): add datagemma models (#3540)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-13 21:49:18 +02:00
Ettore Di Giacinto
925315ab5c models(gallery): add mn-12b-lyra-v4-iq-imatrix (#3539)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-13 19:49:14 +02:00
Ettore Di Giacinto
5213e79f5c models(gallery): add azure_dusk-v0.2-iq-imatrix (#3538)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-13 19:48:54 +02:00
Ettore Di Giacinto
7fe6d0ad2b chore(gosec): fix CI (#3537)
downgrade to latest known version of the gosec action

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-13 19:19:26 +02:00
77 changed files with 1456 additions and 614 deletions

View File

@@ -9,6 +9,7 @@
 # Param 2: email
 #
 config_user() {
+  echo "Configuring git for $1 <$2>"
   local gcn=$(git config --global user.name)
   if [ -z "${gcn}" ]; then
     echo "Setting up git user / remote"
@@ -24,6 +25,7 @@ config_user() {
 # Param 2: remote url
 #
 config_remote() {
+  echo "Adding git remote and fetching $2 as $1"
   local gr=$(git remote -v | grep $1)
   if [ -z "${gr}" ]; then
     git remote add $1 $2

View File

@@ -29,9 +29,14 @@ def calculate_sha256(file_path):
 def manual_safety_check_hf(repo_id):
     scanResponse = requests.get('https://huggingface.co/api/models/' + repo_id + "/scan")
     scan = scanResponse.json()
-    if scan['hasUnsafeFile']:
-        return scan
-    return None
+    # Check if 'hasUnsafeFile' exists in the response
+    if 'hasUnsafeFile' in scan:
+        if scan['hasUnsafeFile']:
+            return scan
+        else:
+            return None
+    else:
+        return None

 download_type, repo_id_or_url = parse_uri(uri)

View File

@@ -13,6 +13,78 @@ concurrency:
   cancel-in-progress: true

 jobs:
+  hipblas-jobs:
+    uses: ./.github/workflows/image_build.yml
+    with:
+      tag-latest: ${{ matrix.tag-latest }}
+      tag-suffix: ${{ matrix.tag-suffix }}
+      ffmpeg: ${{ matrix.ffmpeg }}
+      image-type: ${{ matrix.image-type }}
+      build-type: ${{ matrix.build-type }}
+      cuda-major-version: ${{ matrix.cuda-major-version }}
+      cuda-minor-version: ${{ matrix.cuda-minor-version }}
+      platforms: ${{ matrix.platforms }}
+      runs-on: ${{ matrix.runs-on }}
+      base-image: ${{ matrix.base-image }}
+      grpc-base-image: ${{ matrix.grpc-base-image }}
+      aio: ${{ matrix.aio }}
+      makeflags: ${{ matrix.makeflags }}
+      latest-image: ${{ matrix.latest-image }}
+      latest-image-aio: ${{ matrix.latest-image-aio }}
+    secrets:
+      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
+      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
+      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
+    strategy:
+      # Pushing with all jobs in parallel
+      # eats the bandwidth of all the nodes
+      max-parallel: 2
+      matrix:
+        include:
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-hipblas-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            aio: "-aio-gpu-hipblas"
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            latest-image: 'latest-gpu-hipblas'
+            latest-image-aio: 'latest-aio-gpu-hipblas'
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas'
+            ffmpeg: 'false'
+            image-type: 'extras'
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas-core'
+            ffmpeg: 'false'
+            image-type: 'core'
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
   self-hosted-jobs:
     uses: ./.github/workflows/image_build.yml
     with:
@@ -39,7 +111,7 @@ jobs:
     strategy:
       # Pushing with all jobs in parallel
       # eats the bandwidth of all the nodes
-      max-parallel: ${{ github.event_name != 'pull_request' && 6 || 10 }}
+      max-parallel: ${{ github.event_name != 'pull_request' && 5 || 8 }}
       matrix:
         include:
           # Extra images
@@ -122,29 +194,6 @@ jobs:
             base-image: "ubuntu:22.04"
             runs-on: 'arc-runner-set'
             makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'auto'
-            tag-suffix: '-hipblas-ffmpeg'
-            ffmpeg: 'true'
-            image-type: 'extras'
-            aio: "-aio-gpu-hipblas"
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
-            latest-image: 'latest-gpu-hipblas'
-            latest-image-aio: 'latest-aio-gpu-hipblas'
-            runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-hipblas'
-            ffmpeg: 'false'
-            image-type: 'extras'
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
-            runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
           - build-type: 'sycl_f16'
             platforms: 'linux/amd64'
             tag-latest: 'auto'
@@ -212,26 +261,6 @@ jobs:
             image-type: 'core'
             runs-on: 'arc-runner-set'
             makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-hipblas-ffmpeg-core'
-            ffmpeg: 'true'
-            image-type: 'core'
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
-            runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-hipblas-core'
-            ffmpeg: 'false'
-            image-type: 'core'
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
-            runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
   core-image-build:
     uses: ./.github/workflows/image_build.yml

View File

@@ -297,10 +297,10 @@ COPY .git .
 RUN make prepare

 ## Build the binary
-## If it's CUDA, we want to skip some of the llama-compat backends to save space
-## We only leave the most CPU-optimized variant and the fallback for the cublas build
-## (both will use CUDA for the actual computation)
-RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
+## If it's CUDA or hipblas, we want to skip some of the llama-compat backends to save space
+## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
+## (both will use CUDA or hipblas for the actual computation)
+RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
     SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
 else \
     make build; \
@@ -338,9 +338,8 @@ RUN if [ "${FFMPEG}" = "true" ]; then \
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
-        ssh less && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
+        ssh less wget
+# For the devcontainer, leave apt functional in case additional devtools are needed at runtime.

 RUN go install github.com/go-delve/delve/cmd/dlv@latest

View File

@@ -8,7 +8,7 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=e6b7801bd189d102d901d3e72035611a25456ef1
+CPPLLAMA_VERSION?=70392f1f81470607ba3afef04aa56c9f65587664

 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
@@ -16,7 +16,7 @@ RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
-WHISPER_CPP_VERSION?=a551933542d956ae84634937acd2942eb40efaaf
+WHISPER_CPP_VERSION?=0d2e2aed80109e8696791083bde3b58e190b7812

 # bert.cpp version
 BERT_REPO?=https://github.com/go-skynet/go-bert.cpp
@@ -359,6 +359,9 @@ clean-tests:
	rm -rf test-dir
	rm -rf core/http/backend-assets

+clean-dc: clean
+	cp -r /build/backend-assets /workspace/backend-assets
+
 ## Build:
 build: prepare backend-assets grpcs ## Build the project
	$(info ${GREEN}I local-ai build info:${RESET})

View File

@@ -2,7 +2,7 @@ backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
-name: gpt-4-vision-preview
+name: gpt-4o
 roles:
   user: "USER:"

View File

@@ -2,7 +2,7 @@ backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
-name: gpt-4-vision-preview
+name: gpt-4o
 roles:
   user: "USER:"

View File

@@ -2,7 +2,7 @@ backend: llama-cpp
 context_size: 4096
 mmap: false
 f16: false
-name: gpt-4-vision-preview
+name: gpt-4o
 roles:
   user: "USER:"

View File

@@ -134,6 +134,8 @@ message PredictOptions {
   repeated string Images = 42;
   bool UseTokenizerTemplate = 43;
   repeated Message Messages = 44;
+  repeated string Videos = 45;
+  repeated string Audios = 46;
 }

 // The response message containing the result
View File

@@ -13,6 +13,7 @@
 #include <getopt.h>
 #include "clip.h"
 #include "llava.h"
+#include "log.h"
 #include "stb_image.h"
 #include "common.h"
 #include "json.hpp"
@@ -448,7 +449,7 @@ struct llama_server_context
             LOG_INFO("Multi Modal Mode Enabled", {});
             clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
             if(clp_ctx == nullptr) {
-                LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
+                LOG_ERR("unable to load clip model: %s", params.mmproj.c_str());
                 return false;
             }
@@ -462,7 +463,7 @@ struct llama_server_context
         ctx = llama_init.context;
         if (model == nullptr)
         {
-            LOG_ERROR("unable to load model", {{"model", params.model}});
+            LOG_ERR("unable to load model: %s", params.model.c_str());
             return false;
         }
@@ -470,7 +471,7 @@ struct llama_server_context
             const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
             const int n_embd_llm = llama_n_embd(model);
             if (n_embd_clip != n_embd_llm) {
-                LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
+                LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
                 llama_free(ctx);
                 llama_free_model(model);
                 return false;
@@ -489,7 +490,7 @@ struct llama_server_context
             std::vector<char> buf(1);
             int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
             if (res < 0) {
-                LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
+                LOG_ERR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", __func__);
                 sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
             }
         }
@@ -812,10 +813,11 @@ struct llama_server_context
             img_sl.img_data = clip_image_u8_init();
             if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
             {
-                LOG_ERROR("failed to load image", {
-                    {"slot_id", slot->id},
-                    {"img_sl_id", img_sl.id}
-                });
+                LOG_ERR("%s: failed to load image, slot_id: %d, img_sl_id: %d",
+                    __func__,
+                    slot->id,
+                    img_sl.id
+                );
                 return false;
             }
             LOG_VERBOSE("image loaded", {
@@ -853,12 +855,12 @@ struct llama_server_context
                     }
                 }
                 if (!found) {
-                    LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id);
+                    LOG("ERROR: Image with id: %i, not found.\n", img_id);
                     slot->images.clear();
                     return false;
                 }
             } catch (const std::invalid_argument& e) {
-                LOG_TEE("Invalid image number id in prompt\n");
+                LOG("Invalid image number id in prompt\n");
                 slot->images.clear();
                 return false;
             }
@@ -886,7 +888,7 @@ struct llama_server_context
             {"task_id", slot->task_id},
         });

-        // LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
+        // LOG("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());

         return true;
     }
@@ -926,7 +928,7 @@ struct llama_server_context
                 };

                 if (llama_decode(ctx, batch_view) != 0)
                 {
-                    LOG_TEE("%s: llama_decode() failed\n", __func__);
+                    LOG("%s: llama_decode() failed\n", __func__);
                     return;
                 }
             }
@@ -938,7 +940,7 @@ struct llama_server_context
                 }
             }

-            LOG_TEE("system prompt updated\n");
+            LOG("system prompt updated\n");
             system_need_update = false;
         }
@@ -1120,7 +1122,7 @@ struct llama_server_context
             }

             if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
-                LOG_TEE("Error processing the given image");
+                LOG("Error processing the given image");
                 return false;
             }
@@ -1132,7 +1134,7 @@ struct llama_server_context
     void send_error(task_server& task, const std::string &error)
     {
-        LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
+        LOG("task %i - error: %s\n", task.id, error.c_str());
         task_result res;
         res.id = task.id;
         res.multitask_id = task.multitask_id;
@@ -1371,7 +1373,7 @@ struct llama_server_context
                     };

                     if (llama_decode(ctx, batch_view))
                     {
-                        LOG_TEE("%s : failed to eval\n", __func__);
+                        LOG("%s : failed to eval\n", __func__);
                         return false;
                     }
                 }
@@ -1389,7 +1391,7 @@ struct llama_server_context
                     llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, };
                     if (llama_decode(ctx, batch_img))
                     {
-                        LOG_TEE("%s : failed to eval image\n", __func__);
+                        LOG("%s : failed to eval image\n", __func__);
                         return false;
                     }
                     slot.n_past += n_eval;
@@ -1572,7 +1574,7 @@ struct llama_server_context
                     slot.n_past = 0;
                     slot.truncated = false;
                     slot.has_next_token = true;
-                    LOG_TEE("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
+                    LOG("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());

                     continue;
                     // END LOCALAI changes
@@ -1820,10 +1822,11 @@ struct llama_server_context
if (has_images && !ingest_images(slot, n_batch))
{
LOG_ERROR("failed processing images", {
"slot_id", slot.id,
"task_id", slot.task_id,
});
LOG_ERR("%s: failed processing images Slot id : %d, Task id: %d",
__func__,
slot.id,
slot.task_id
);
// FIXME @phymbert: to be properly tested
// early returning without changing the slot state will block the slot for ever
// no one at the moment is checking the return value
@@ -1863,10 +1866,10 @@ struct llama_server_context
const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
LOG_TEE("\n");
LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
LOG("\n");
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
@@ -1876,7 +1879,7 @@ struct llama_server_context
slot.ga_i += slot.ga_w / slot.ga_n;
LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
}
slot.n_past_se += n_tokens;
}
@@ -1901,11 +1904,11 @@ struct llama_server_context
if (n_batch == 1 || ret < 0)
{
// if you get here, it means the KV cache is full - try increasing it via the context size
LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
LOG("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
return false;
}
LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
LOG("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
// retry with half the batch size to try to find a free slot in the KV cache
n_batch /= 2;
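The retry loop above is the server's KV-cache pressure valve: a failed llama_decode is retried with half the batch until either a slot frees up or n_batch reaches 1, at which point only a larger context can help. A simplified Go sketch of the same back-off pattern (the real code also bails out early on fatal return codes; decode here is a hypothetical stand-in for llama_decode):

package batching

import "fmt"

// decodeWithBackoff retries a batch decode, halving the batch size on each
// failure until the decode succeeds or the batch cannot shrink any further.
func decodeWithBackoff(decode func(nBatch int) error, nBatch int) error {
	for {
		err := decode(nBatch)
		if err == nil {
			return nil
		}
		if nBatch == 1 {
			// The KV cache is genuinely full - increasing the context
			// size is the only remaining option.
			return fmt.Errorf("decode failed at n_batch=1: %w", err)
		}
		// Retry with half the batch size to look for free KV-cache space.
		nBatch /= 2
	}
}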

View File

@@ -2,4 +2,4 @@
intel-extension-for-pytorch
torch
optimum[openvino]
setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406

View File

@@ -3,6 +3,6 @@ intel-extension-for-pytorch
torch
torchaudio
optimum[openvino]
setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
transformers
accelerate

View File

@@ -3,6 +3,6 @@ intel-extension-for-pytorch
torch
torchaudio
optimum[openvino]
setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
transformers
accelerate

View File

@@ -1,4 +1,4 @@
TTS==0.22.0
coqui-tts
grpcio==1.66.1
protobuf
certifi

View File

@@ -3,7 +3,7 @@ intel-extension-for-pytorch
torch
torchvision
optimum[openvino]
setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
diffusers
opencv-python
transformers

View File

@@ -15,5 +15,12 @@ installRequirements
# https://github.com/descriptinc/audiotools/issues/101
# incompatible protobuf versions.
PYDIR=$(ls ${MY_DIR}/venv/lib)
curl -L https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o ${MY_DIR}/venv/lib/${PYDIR}/site-packages/google/protobuf/internal/builder.py
PYDIR=python3.10
pyenv="${MY_DIR}/venv/lib/${PYDIR}/site-packages/google/protobuf/internal/"
if [ ! -d ${pyenv} ]; then
echo "(parler-tts/install.sh): Error: ${pyenv} does not exist"
exit 1
fi
curl -L https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o ${pyenv}/builder.py

View File

@@ -3,6 +3,6 @@ intel-extension-for-pytorch
torch
torchaudio
optimum[openvino]
setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
transformers
accelerate

View File

@@ -5,4 +5,4 @@ accelerate
torch
rerankers[transformers]
optimum[openvino]
setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406

View File

@@ -55,7 +55,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
"""
model_name = request.Model
try:
self.model = SentenceTransformer(model_name)
self.model = SentenceTransformer(model_name, trust_remote_code=request.TrustRemoteCode)
except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")

View File

@@ -2,5 +2,5 @@ torch
accelerate
transformers
bitsandbytes
sentence-transformers==3.0.1
sentence-transformers==3.1.1
transformers

View File

@@ -1,5 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/cu118
torch
accelerate
sentence-transformers==3.0.1
sentence-transformers==3.1.1
transformers

View File

@@ -1,4 +1,4 @@
torch
accelerate
sentence-transformers==3.0.1
sentence-transformers==3.1.1
transformers

View File

@@ -1,5 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch
accelerate
sentence-transformers==3.0.1
sentence-transformers==3.1.1
transformers

View File

@@ -4,5 +4,5 @@ torch
optimum[openvino]
setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
accelerate
sentence-transformers==3.0.1
sentence-transformers==3.1.1
transformers

View File

@@ -1,3 +1,5 @@
grpcio==1.66.1
protobuf
certifi
certifi
datasets
einops

View File

@@ -4,4 +4,4 @@ transformers
accelerate
torch
optimum[openvino]
setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406

View File

@@ -4,4 +4,4 @@ accelerate
torch
torchaudio
optimum[openvino]
setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406

View File

@@ -4,4 +4,4 @@ accelerate
torch
transformers
optimum[openvino]
setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406

View File

@@ -12,7 +12,7 @@ import (
func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) {
modelFile := backendConfig.Model
grpcOpts := gRPCModelOpts(backendConfig)
grpcOpts := GRPCModelOpts(backendConfig)
var inferenceModel interface{}
var err error

View File

@@ -12,7 +12,7 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
if *threads == 0 && appConfig.Threads != 0 {
threads = &appConfig.Threads
}
gRPCOpts := gRPCModelOpts(backendConfig)
gRPCOpts := GRPCModelOpts(backendConfig)
opts := modelOpts(backendConfig, appConfig, []model.Option{
model.WithBackendString(backendConfig.Backend),
model.WithAssetDir(appConfig.AssetsDestination),

View File

@@ -31,13 +31,13 @@ type TokenUsage struct {
Completion int
}
func ModelInference(ctx context.Context, s string, messages []schema.Message, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos, audios []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
modelFile := c.Model
threads := c.Threads
if *threads == 0 && o.Threads != 0 {
threads = &o.Threads
}
grpcOpts := gRPCModelOpts(c)
grpcOpts := GRPCModelOpts(c)
var inferenceModel grpc.Backend
var err error
@@ -101,6 +101,8 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
opts.Messages = protoMessages
opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate
opts.Images = images
opts.Videos = videos
opts.Audios = audios
tokenUsage := TokenUsage{}
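The widened ModelInference signature threads the two new media slices through every call site. A sketch of an updated caller, assuming the LocalAI module is importable and that LLMResponse exposes the generated text as Response, as elsewhere in the codebase:

package example

import (
	"context"

	"github.com/mudler/LocalAI/core/backend"
	"github.com/mudler/LocalAI/core/config"
	"github.com/mudler/LocalAI/core/schema"
	"github.com/mudler/LocalAI/pkg/model"
)

// runInference passes images, videos and audios as three separate slices of
// base64-encoded content, mirroring the updated call sites in ComputeChoices
// and handleQuestion. The nil callback disables token streaming.
func runInference(ctx context.Context, prompt string, msgs []schema.Message,
	images, videos, audios []string, ml *model.ModelLoader,
	cfg config.BackendConfig, app *config.ApplicationConfig) (string, error) {
	pred, err := backend.ModelInference(ctx, prompt, msgs, images, videos, audios, ml, cfg, app, nil)
	if err != nil {
		return "", err
	}
	res, err := pred()
	if err != nil {
		return "", err
	}
	return res.Response, nil
}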

View File

@@ -44,7 +44,7 @@ func getSeed(c config.BackendConfig) int32 {
return seed
}
func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
func GRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
b := 512
if c.Batch != 0 {
b = c.Batch
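The only change here is the capital G, but in Go that is the whole point: identifiers that start with an upper-case letter are exported, so renaming gRPCModelOpts to GRPCModelOpts is what makes the helper visible outside its package. A minimal illustration of the visibility rule:

package visibility

// helper is unexported: only code inside package visibility can call it.
func helper() int { return 1 }

// Helper is exported: other packages call it as visibility.Helper().
func Helper() int { return helper() }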

View File

@@ -15,7 +15,7 @@ func Rerank(backend, modelFile string, request *proto.RerankRequest, loader *mod
return nil, fmt.Errorf("backend is required")
}
grpcOpts := gRPCModelOpts(backendConfig)
grpcOpts := GRPCModelOpts(backendConfig)
opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
model.WithBackendString(bb),

View File

@@ -29,7 +29,7 @@ func SoundGeneration(
return "", nil, fmt.Errorf("backend is a required parameter")
}
grpcOpts := gRPCModelOpts(backendConfig)
grpcOpts := GRPCModelOpts(backendConfig)
opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
model.WithBackendString(backend),
model.WithModel(modelFile),

View File

@@ -28,7 +28,7 @@ func ModelTTS(
bb = model.PiperBackend
}
grpcOpts := gRPCModelOpts(backendConfig)
grpcOpts := GRPCModelOpts(backendConfig)
opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
model.WithBackendString(bb),

View File

@@ -41,31 +41,35 @@ type RunCMD struct {
Threads int `env:"LOCALAI_THREADS,THREADS" short:"t" help:"Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested" group:"performance"`
ContextSize int `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" default:"512" help:"Default context size for models" group:"performance"`
Address string `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
CORS bool `env:"LOCALAI_CORS,CORS" help:"" group:"api"`
CORSAllowOrigins string `env:"LOCALAI_CORS_ALLOW_ORIGINS,CORS_ALLOW_ORIGINS" group:"api"`
LibraryPath string `env:"LOCALAI_LIBRARY_PATH,LIBRARY_PATH" help:"Path to the library directory (for e.g. external libraries used by backends)" default:"/usr/share/local-ai/libs" group:"backends"`
CSRF bool `env:"LOCALAI_CSRF" help:"Enables fiber CSRF middleware" group:"api"`
UploadLimit int `env:"LOCALAI_UPLOAD_LIMIT,UPLOAD_LIMIT" default:"15" help:"Default upload-limit in MB" group:"api"`
APIKeys []string `env:"LOCALAI_API_KEY,API_KEY" help:"List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys" group:"api"`
DisableWebUI bool `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disable webui" group:"api"`
DisablePredownloadScan bool `env:"LOCALAI_DISABLE_PREDOWNLOAD_SCAN" help:"If true, disables the best-effort security scanner before downloading any files." group:"hardening" default:"false"`
OpaqueErrors bool `env:"LOCALAI_OPAQUE_ERRORS" default:"false" help:"If true, all error responses are replaced with blank 500 errors. This is intended only for hardening against information leaks and is normally not recommended." group:"hardening"`
Peer2Peer bool `env:"LOCALAI_P2P,P2P" name:"p2p" default:"false" help:"Enable P2P mode" group:"p2p"`
Peer2PeerDHTInterval int `env:"LOCALAI_P2P_DHT_INTERVAL,P2P_DHT_INTERVAL" default:"360" name:"p2p-dht-interval" help:"Interval for DHT refresh (used during token generation)" group:"p2p"`
Peer2PeerOTPInterval int `env:"LOCALAI_P2P_OTP_INTERVAL,P2P_OTP_INTERVAL" default:"9000" name:"p2p-otp-interval" help:"Interval for OTP refresh (used during token generation)" group:"p2p"`
Peer2PeerToken string `env:"LOCALAI_P2P_TOKEN,P2P_TOKEN,TOKEN" name:"p2ptoken" help:"Token for P2P mode (optional)" group:"p2p"`
Peer2PeerNetworkID string `env:"LOCALAI_P2P_NETWORK_ID,P2P_NETWORK_ID" help:"Network ID for P2P mode, can be set arbitrarily by the user for grouping a set of instances" group:"p2p"`
ParallelRequests bool `env:"LOCALAI_PARALLEL_REQUESTS,PARALLEL_REQUESTS" help:"Enable backends to handle multiple requests in parallel if they support it (e.g.: llama.cpp or vllm)" group:"backends"`
SingleActiveBackend bool `env:"LOCALAI_SINGLE_ACTIVE_BACKEND,SINGLE_ACTIVE_BACKEND" help:"Allow only one backend to be run at a time" group:"backends"`
PreloadBackendOnly bool `env:"LOCALAI_PRELOAD_BACKEND_ONLY,PRELOAD_BACKEND_ONLY" default:"false" help:"Do not launch the API services, only the preloaded models / backends are started (useful for multi-node setups)" group:"backends"`
ExternalGRPCBackends []string `env:"LOCALAI_EXTERNAL_GRPC_BACKENDS,EXTERNAL_GRPC_BACKENDS" help:"A list of external grpc backends" group:"backends"`
EnableWatchdogIdle bool `env:"LOCALAI_WATCHDOG_IDLE,WATCHDOG_IDLE" default:"false" help:"Enable watchdog for stopping backends that are idle longer than the watchdog-idle-timeout" group:"backends"`
WatchdogIdleTimeout string `env:"LOCALAI_WATCHDOG_IDLE_TIMEOUT,WATCHDOG_IDLE_TIMEOUT" default:"15m" help:"Threshold beyond which an idle backend should be stopped" group:"backends"`
EnableWatchdogBusy bool `env:"LOCALAI_WATCHDOG_BUSY,WATCHDOG_BUSY" default:"false" help:"Enable watchdog for stopping backends that are busy longer than the watchdog-busy-timeout" group:"backends"`
WatchdogBusyTimeout string `env:"LOCALAI_WATCHDOG_BUSY_TIMEOUT,WATCHDOG_BUSY_TIMEOUT" default:"5m" help:"Threshold beyond which a busy backend should be stopped" group:"backends"`
Federated bool `env:"LOCALAI_FEDERATED,FEDERATED" help:"Enable federated instance" group:"federated"`
DisableGalleryEndpoint bool `env:"LOCALAI_DISABLE_GALLERY_ENDPOINT,DISABLE_GALLERY_ENDPOINT" help:"Disable the gallery endpoints" group:"api"`
Address string `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
CORS bool `env:"LOCALAI_CORS,CORS" help:"" group:"api"`
CORSAllowOrigins string `env:"LOCALAI_CORS_ALLOW_ORIGINS,CORS_ALLOW_ORIGINS" group:"api"`
LibraryPath string `env:"LOCALAI_LIBRARY_PATH,LIBRARY_PATH" help:"Path to the library directory (for e.g. external libraries used by backends)" default:"/usr/share/local-ai/libs" group:"backends"`
CSRF bool `env:"LOCALAI_CSRF" help:"Enables fiber CSRF middleware" group:"api"`
UploadLimit int `env:"LOCALAI_UPLOAD_LIMIT,UPLOAD_LIMIT" default:"15" help:"Default upload-limit in MB" group:"api"`
APIKeys []string `env:"LOCALAI_API_KEY,API_KEY" help:"List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys" group:"api"`
DisableWebUI bool `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disable webui" group:"api"`
DisablePredownloadScan bool `env:"LOCALAI_DISABLE_PREDOWNLOAD_SCAN" help:"If true, disables the best-effort security scanner before downloading any files." group:"hardening" default:"false"`
OpaqueErrors bool `env:"LOCALAI_OPAQUE_ERRORS" default:"false" help:"If true, all error responses are replaced with blank 500 errors. This is intended only for hardening against information leaks and is normally not recommended." group:"hardening"`
UseSubtleKeyComparison bool `env:"LOCALAI_SUBTLE_KEY_COMPARISON" default:"false" help:"If true, API Key validation comparisons will be performed using constant-time comparisons rather than simple equality. This trades off performance on each request for resiliency against timing attacks." group:"hardening"`
DisableApiKeyRequirementForHttpGet bool `env:"LOCALAI_DISABLE_API_KEY_REQUIREMENT_FOR_HTTP_GET" default:"false" help:"If true, a valid API key is not required to issue GET requests to portions of the web ui. This should only be enabled in secure testing environments" group:"hardening"`
HttpGetExemptedEndpoints []string `env:"LOCALAI_HTTP_GET_EXEMPTED_ENDPOINTS" default:"^/$,^/browse/?$,^/talk/?$,^/p2p/?$,^/chat/?$,^/text2image/?$,^/tts/?$,^/static/.*$,^/swagger.*$" help:"If LOCALAI_DISABLE_API_KEY_REQUIREMENT_FOR_HTTP_GET is overridden to true, this is the list of endpoints to exempt. Only adjust this in case of a security incident or as a result of a personal security posture review" group:"hardening"`
Peer2Peer bool `env:"LOCALAI_P2P,P2P" name:"p2p" default:"false" help:"Enable P2P mode" group:"p2p"`
Peer2PeerDHTInterval int `env:"LOCALAI_P2P_DHT_INTERVAL,P2P_DHT_INTERVAL" default:"360" name:"p2p-dht-interval" help:"Interval for DHT refresh (used during token generation)" group:"p2p"`
Peer2PeerOTPInterval int `env:"LOCALAI_P2P_OTP_INTERVAL,P2P_OTP_INTERVAL" default:"9000" name:"p2p-otp-interval" help:"Interval for OTP refresh (used during token generation)" group:"p2p"`
Peer2PeerToken string `env:"LOCALAI_P2P_TOKEN,P2P_TOKEN,TOKEN" name:"p2ptoken" help:"Token for P2P mode (optional)" group:"p2p"`
Peer2PeerNetworkID string `env:"LOCALAI_P2P_NETWORK_ID,P2P_NETWORK_ID" help:"Network ID for P2P mode, can be set arbitrarily by the user for grouping a set of instances" group:"p2p"`
ParallelRequests bool `env:"LOCALAI_PARALLEL_REQUESTS,PARALLEL_REQUESTS" help:"Enable backends to handle multiple requests in parallel if they support it (e.g.: llama.cpp or vllm)" group:"backends"`
SingleActiveBackend bool `env:"LOCALAI_SINGLE_ACTIVE_BACKEND,SINGLE_ACTIVE_BACKEND" help:"Allow only one backend to be run at a time" group:"backends"`
PreloadBackendOnly bool `env:"LOCALAI_PRELOAD_BACKEND_ONLY,PRELOAD_BACKEND_ONLY" default:"false" help:"Do not launch the API services, only the preloaded models / backends are started (useful for multi-node setups)" group:"backends"`
ExternalGRPCBackends []string `env:"LOCALAI_EXTERNAL_GRPC_BACKENDS,EXTERNAL_GRPC_BACKENDS" help:"A list of external grpc backends" group:"backends"`
EnableWatchdogIdle bool `env:"LOCALAI_WATCHDOG_IDLE,WATCHDOG_IDLE" default:"false" help:"Enable watchdog for stopping backends that are idle longer than the watchdog-idle-timeout" group:"backends"`
WatchdogIdleTimeout string `env:"LOCALAI_WATCHDOG_IDLE_TIMEOUT,WATCHDOG_IDLE_TIMEOUT" default:"15m" help:"Threshold beyond which an idle backend should be stopped" group:"backends"`
EnableWatchdogBusy bool `env:"LOCALAI_WATCHDOG_BUSY,WATCHDOG_BUSY" default:"false" help:"Enable watchdog for stopping backends that are busy longer than the watchdog-busy-timeout" group:"backends"`
WatchdogBusyTimeout string `env:"LOCALAI_WATCHDOG_BUSY_TIMEOUT,WATCHDOG_BUSY_TIMEOUT" default:"5m" help:"Threshold beyond which a busy backend should be stopped" group:"backends"`
Federated bool `env:"LOCALAI_FEDERATED,FEDERATED" help:"Enable federated instance" group:"federated"`
DisableGalleryEndpoint bool `env:"LOCALAI_DISABLE_GALLERY_ENDPOINT,DISABLE_GALLERY_ENDPOINT" help:"Disable the gallery endpoints" group:"api"`
LoadToMemory []string `env:"LOCALAI_LOAD_TO_MEMORY,LOAD_TO_MEMORY" help:"A list of models to load into memory at startup" group:"models"`
}
func (r *RunCMD) Run(ctx *cliContext.Context) error {
@@ -97,7 +101,11 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
config.WithModelsURL(append(r.Models, r.ModelArgs...)...),
config.WithOpaqueErrors(r.OpaqueErrors),
config.WithEnforcedPredownloadScans(!r.DisablePredownloadScan),
config.WithSubtleKeyComparison(r.UseSubtleKeyComparison),
config.WithDisableApiKeyRequirementForHttpGet(r.DisableApiKeyRequirementForHttpGet),
config.WithHttpGetExemptedEndpoints(r.HttpGetExemptedEndpoints),
config.WithP2PNetworkID(r.Peer2PeerNetworkID),
config.WithLoadToMemory(r.LoadToMemory),
}
token := ""

View File

@@ -4,6 +4,7 @@ import (
"context"
"embed"
"encoding/json"
"regexp"
"time"
"github.com/mudler/LocalAI/pkg/xsysinfo"
@@ -16,7 +17,6 @@ type ApplicationConfig struct {
ModelPath string
LibPath string
UploadLimitMB, Threads, ContextSize int
DisableWebUI bool
F16 bool
Debug bool
ImageDir string
@@ -31,11 +31,18 @@ type ApplicationConfig struct {
PreloadModelsFromPath string
CORSAllowOrigins string
ApiKeys []string
EnforcePredownloadScans bool
OpaqueErrors bool
P2PToken string
P2PNetworkID string
DisableWebUI bool
EnforcePredownloadScans bool
OpaqueErrors bool
UseSubtleKeyComparison bool
DisableApiKeyRequirementForHttpGet bool
HttpGetExemptedEndpoints []*regexp.Regexp
DisableGalleryEndpoint bool
LoadToMemory []string
ModelLibraryURL string
Galleries []Gallery
@@ -57,8 +64,6 @@ type ApplicationConfig struct {
ModelsURL []string
WatchDogBusyTimeout, WatchDogIdleTimeout time.Duration
DisableGalleryEndpoint bool
}
type AppOption func(*ApplicationConfig)
@@ -327,6 +332,38 @@ func WithOpaqueErrors(opaque bool) AppOption {
}
}
func WithLoadToMemory(models []string) AppOption {
return func(o *ApplicationConfig) {
o.LoadToMemory = models
}
}
func WithSubtleKeyComparison(subtle bool) AppOption {
return func(o *ApplicationConfig) {
o.UseSubtleKeyComparison = subtle
}
}
func WithDisableApiKeyRequirementForHttpGet(disable bool) AppOption {
return func(o *ApplicationConfig) {
o.DisableApiKeyRequirementForHttpGet = disable
}
}
func WithHttpGetExemptedEndpoints(endpoints []string) AppOption {
return func(o *ApplicationConfig) {
o.HttpGetExemptedEndpoints = []*regexp.Regexp{}
for _, epr := range endpoints {
r, err := regexp.Compile(epr)
if err == nil && r != nil {
o.HttpGetExemptedEndpoints = append(o.HttpGetExemptedEndpoints, r)
} else {
log.Warn().Err(err).Str("regex", epr).Msg("Error while compiling HTTP Get Exemption regex, skipping this entry.")
}
}
}
}
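WithHttpGetExemptedEndpoints compiles each exemption pattern once at startup and drops invalid patterns with a warning instead of aborting boot. The same compile-and-filter step in miniature, using a few of the default patterns plus one deliberately broken one:

package main

import (
	"fmt"
	"regexp"
)

func main() {
	patterns := []string{`^/$`, `^/browse/?$`, `^/static/.*$`, `(`} // `(` fails to compile
	var exempted []*regexp.Regexp
	for _, p := range patterns {
		r, err := regexp.Compile(p)
		if err != nil {
			fmt.Printf("skipping invalid exemption regex %q: %v\n", p, err)
			continue
		}
		exempted = append(exempted, r)
	}
	// A GET to /browse matches ^/browse/?$ and would bypass key auth
	// (only when the HTTP GET requirement is disabled in the first place).
	for _, r := range exempted {
		fmt.Printf("%-15s matches /browse: %v\n", r.String(), r.MatchString("/browse"))
	}
}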
// ToConfigLoaderOptions returns a slice of ConfigLoader Option.
// Some options defined at the application level are going to be passed as defaults for
// all the configuration for the models.

View File

@@ -132,7 +132,7 @@ func AvailableGalleryModels(galleries []config.Gallery, basePath string) ([]*Gal
func findGalleryURLFromReferenceURL(url string, basePath string) (string, error) {
var refFile string
uri := downloader.URI(url)
err := uri.DownloadAndUnmarshal(basePath, func(url string, d []byte) error {
err := uri.DownloadWithCallback(basePath, func(url string, d []byte) error {
refFile = string(d)
if len(refFile) == 0 {
return fmt.Errorf("invalid reference file at url %s: %s", url, d)
@@ -156,7 +156,7 @@ func getGalleryModels(gallery config.Gallery, basePath string) ([]*GalleryModel,
}
uri := downloader.URI(gallery.URL)
err := uri.DownloadAndUnmarshal(basePath, func(url string, d []byte) error {
err := uri.DownloadWithCallback(basePath, func(url string, d []byte) error {
return yaml.Unmarshal(d, &models)
})
if err != nil {

View File

@@ -69,7 +69,7 @@ type PromptTemplate struct {
func GetGalleryConfigFromURL(url string, basePath string) (Config, error) {
var config Config
uri := downloader.URI(url)
err := uri.DownloadAndUnmarshal(basePath, func(url string, d []byte) error {
err := uri.DownloadWithCallback(basePath, func(url string, d []byte) error {
return yaml.Unmarshal(d, &config)
})
if err != nil {
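The rename from DownloadAndUnmarshal to DownloadWithCallback matches what the helper actually does: it fetches the artifact and hands the raw bytes to a caller-supplied callback, and decoding is the callback's job. A sketch of the pattern (the yaml import path is an assumption):

package example

import (
	"gopkg.in/yaml.v2"

	"github.com/mudler/LocalAI/core/config"
	"github.com/mudler/LocalAI/pkg/downloader"
)

// loadGalleries fetches a gallery index and decodes it inside the callback,
// mirroring the DownloadWithCallback call sites above.
func loadGalleries(url, basePath string) ([]config.Gallery, error) {
	var galleries []config.Gallery
	uri := downloader.URI(url)
	err := uri.DownloadWithCallback(basePath, func(_ string, d []byte) error {
		return yaml.Unmarshal(d, &galleries)
	})
	return galleries, err
}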

View File

@@ -3,13 +3,15 @@ package http
import (
"embed"
"errors"
"fmt"
"net/http"
"strings"
"github.com/dave-gray101/v2keyauth"
"github.com/mudler/LocalAI/pkg/utils"
"github.com/mudler/LocalAI/core/http/endpoints/localai"
"github.com/mudler/LocalAI/core/http/endpoints/openai"
"github.com/mudler/LocalAI/core/http/middleware"
"github.com/mudler/LocalAI/core/http/routes"
"github.com/mudler/LocalAI/core/config"
@@ -29,24 +31,6 @@ import (
"github.com/rs/zerolog/log"
)
func readAuthHeader(c *fiber.Ctx) string {
authHeader := c.Get("Authorization")
// elevenlabs
xApiKey := c.Get("xi-api-key")
if xApiKey != "" {
authHeader = "Bearer " + xApiKey
}
// anthropic
xApiKey = c.Get("x-api-key")
if xApiKey != "" {
authHeader = "Bearer " + xApiKey
}
return authHeader
}
// Embed a directory
//
//go:embed static/*
@@ -137,37 +121,17 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi
})
}
// Auth middleware checking if API key is valid. If no API key is set, no auth is required.
auth := func(c *fiber.Ctx) error {
if len(appConfig.ApiKeys) == 0 {
return c.Next()
}
// Health Checks should always be exempt from auth, so register these first
routes.HealthRoutes(app)
if len(appConfig.ApiKeys) == 0 {
return c.Next()
}
authHeader := readAuthHeader(c)
if authHeader == "" {
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Authorization header missing"})
}
// If it's a bearer token
authHeaderParts := strings.Split(authHeader, " ")
if len(authHeaderParts) != 2 || authHeaderParts[0] != "Bearer" {
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid Authorization header format"})
}
apiKey := authHeaderParts[1]
for _, key := range appConfig.ApiKeys {
if apiKey == key {
return c.Next()
}
}
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid API key"})
kaConfig, err := middleware.GetKeyAuthConfig(appConfig)
if err != nil || kaConfig == nil {
return nil, fmt.Errorf("failed to create key auth config: %w", err)
}
// Auth is applied to _all_ endpoints. No exceptions. Filtering out endpoints to bypass is the role of the Filter property of the KeyAuth Configuration
app.Use(v2keyauth.New(*kaConfig))
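With v2keyauth in front of every route, a client can present its key in any of the three header spellings the custom lookup registers. Whether the alternate headers take the bare key, as sketched here, is an assumption - the old readAuthHeader shim wrapped them in a Bearer scheme itself. A quick comparison of the three request styles against a hypothetical local instance:

package main

import (
	"fmt"
	"net/http"
)

func main() {
	const key = "joshua" // hypothetical API key
	headers := []struct{ name, value string }{
		{"Authorization", "Bearer " + key}, // standard bearer token
		{"xi-api-key", key},                // ElevenLabs-style header
		{"x-api-key", key},                 // Anthropic-style header
	}
	for _, h := range headers {
		req, err := http.NewRequest("GET", "http://127.0.0.1:8080/v1/models", nil)
		if err != nil {
			panic(err)
		}
		req.Header.Set(h.name, h.value)
		resp, err := http.DefaultClient.Do(req)
		if err != nil {
			fmt.Println(h.name, "request failed:", err)
			continue
		}
		resp.Body.Close()
		fmt.Println(h.name, "->", resp.Status)
	}
}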
if appConfig.CORS {
var c func(ctx *fiber.Ctx) error
if appConfig.CORSAllowOrigins == "" {
@@ -192,13 +156,13 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi
galleryService := services.NewGalleryService(appConfig)
galleryService.Start(appConfig.Context, cl)
routes.RegisterElevenLabsRoutes(app, cl, ml, appConfig, auth)
routes.RegisterLocalAIRoutes(app, cl, ml, appConfig, galleryService, auth)
routes.RegisterOpenAIRoutes(app, cl, ml, appConfig, auth)
routes.RegisterElevenLabsRoutes(app, cl, ml, appConfig)
routes.RegisterLocalAIRoutes(app, cl, ml, appConfig, galleryService)
routes.RegisterOpenAIRoutes(app, cl, ml, appConfig)
if !appConfig.DisableWebUI {
routes.RegisterUIRoutes(app, cl, ml, appConfig, galleryService, auth)
routes.RegisterUIRoutes(app, cl, ml, appConfig, galleryService)
}
routes.RegisterJINARoutes(app, cl, ml, appConfig, auth)
routes.RegisterJINARoutes(app, cl, ml, appConfig)
httpFS := http.FS(embedDirStatic)

View File

@@ -31,6 +31,9 @@ import (
"github.com/sashabaranov/go-openai/jsonschema"
)
const apiKey = "joshua"
const bearerKey = "Bearer " + apiKey
const testPrompt = `### System:
You are an AI assistant that follows instruction extremely well. Help as much as you can.
@@ -50,11 +53,19 @@ type modelApplyRequest struct {
func getModelStatus(url string) (response map[string]interface{}) {
// Create the HTTP request
resp, err := http.Get(url)
req, err := http.NewRequest("GET", url, nil)
if err != nil {
fmt.Println("Error creating request:", err)
return
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Authorization", bearerKey)
client := &http.Client{}
resp, err := client.Do(req)
if err != nil {
fmt.Println("Error sending request:", err)
return
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
@@ -72,14 +83,15 @@ func getModelStatus(url string) (response map[string]interface{}) {
return
}
func getModels(url string) (response []gallery.GalleryModel) {
func getModels(url string) ([]gallery.GalleryModel, error) {
response := []gallery.GalleryModel{}
uri := downloader.URI(url)
// TODO: No tests currently seem to exercise file:// urls. Fix?
uri.DownloadAndUnmarshal("", func(url string, i []byte) error {
err := uri.DownloadWithAuthorizationAndCallback("", bearerKey, func(url string, i []byte) error {
// Unmarshal YAML data into a struct
return json.Unmarshal(i, &response)
})
return
return response, err
}
func postModelApplyRequest(url string, request modelApplyRequest) (response map[string]interface{}) {
@@ -101,6 +113,7 @@ func postModelApplyRequest(url string, request modelApplyRequest) (response map[
return
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Authorization", bearerKey)
// Make the request
client := &http.Client{}
@@ -140,6 +153,7 @@ func postRequestJSON[B any](url string, bodyJson *B) error {
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Authorization", bearerKey)
client := &http.Client{}
resp, err := client.Do(req)
@@ -175,6 +189,7 @@ func postRequestResponseJSON[B1 any, B2 any](url string, reqJson *B1, respJson *
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Authorization", bearerKey)
client := &http.Client{}
resp, err := client.Do(req)
@@ -195,6 +210,35 @@ func postRequestResponseJSON[B1 any, B2 any](url string, reqJson *B1, respJson *
return json.Unmarshal(body, respJson)
}
func postInvalidRequest(url string) (error, int) {
req, err := http.NewRequest("POST", url, bytes.NewBufferString("invalid request"))
if err != nil {
return err, -1
}
req.Header.Set("Content-Type", "application/json")
client := &http.Client{}
resp, err := client.Do(req)
if err != nil {
return err, -1
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return err, -1
}
if resp.StatusCode < 200 || resp.StatusCode >= 400 {
return fmt.Errorf("unexpected status code: %d, body: %s", resp.StatusCode, string(body)), resp.StatusCode
}
return nil, resp.StatusCode
}
//go:embed backend-assets/*
var backendAssets embed.FS
@@ -260,6 +304,7 @@ var _ = Describe("API test", func() {
config.WithContext(c),
config.WithGalleries(galleries),
config.WithModelPath(modelDir),
config.WithApiKeys([]string{apiKey}),
config.WithBackendAssets(backendAssets),
config.WithBackendAssetsOutput(backendAssetsDir))...)
Expect(err).ToNot(HaveOccurred())
@@ -269,7 +314,7 @@ var _ = Describe("API test", func() {
go app.Listen("127.0.0.1:9090")
defaultConfig := openai.DefaultConfig("")
defaultConfig := openai.DefaultConfig(apiKey)
defaultConfig.BaseURL = "http://127.0.0.1:9090/v1"
client2 = openaigo.NewClient("")
@@ -295,10 +340,19 @@ var _ = Describe("API test", func() {
Expect(err).To(HaveOccurred())
})
Context("Auth Tests", func() {
It("Should fail if the api key is missing", func() {
err, sc := postInvalidRequest("http://127.0.0.1:9090/models/available")
Expect(err).ToNot(BeNil())
Expect(sc).To(Equal(403))
})
})
Context("Applying models", func() {
It("applies models from a gallery", func() {
models := getModels("http://127.0.0.1:9090/models/available")
models, err := getModels("http://127.0.0.1:9090/models/available")
Expect(err).To(BeNil())
Expect(len(models)).To(Equal(2), fmt.Sprint(models))
Expect(models[0].Installed).To(BeFalse(), fmt.Sprint(models))
Expect(models[1].Installed).To(BeFalse(), fmt.Sprint(models))
@@ -331,7 +385,8 @@ var _ = Describe("API test", func() {
Expect(content["backend"]).To(Equal("bert-embeddings"))
Expect(content["foo"]).To(Equal("bar"))
models = getModels("http://127.0.0.1:9090/models/available")
models, err = getModels("http://127.0.0.1:9090/models/available")
Expect(err).To(BeNil())
Expect(len(models)).To(Equal(2), fmt.Sprint(models))
Expect(models[0].Name).To(Or(Equal("bert"), Equal("bert2")))
Expect(models[1].Name).To(Or(Equal("bert"), Equal("bert2")))

View File

@@ -640,8 +640,16 @@ func handleQuestion(config *config.BackendConfig, input *schema.OpenAIRequest, m
for _, m := range input.Messages {
images = append(images, m.StringImages...)
}
videos := []string{}
for _, m := range input.Messages {
videos = append(videos, m.StringVideos...)
}
audios := []string{}
for _, m := range input.Messages {
audios = append(audios, m.StringAudios...)
}
predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, ml, *config, o, nil)
predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, videos, audios, ml, *config, o, nil)
if err != nil {
log.Error().Err(err).Msg("model inference failed")
return "", err

View File

@@ -27,9 +27,17 @@ func ComputeChoices(
for _, m := range req.Messages {
images = append(images, m.StringImages...)
}
videos := []string{}
for _, m := range req.Messages {
videos = append(videos, m.StringVideos...)
}
audios := []string{}
for _, m := range req.Messages {
audios = append(audios, m.StringAudios...)
}
// get the model function to call for the result
predFunc, err := backend.ModelInference(req.Context, predInput, req.Messages, images, loader, *config, o, tokenCallback)
predFunc, err := backend.ModelInference(req.Context, predInput, req.Messages, images, videos, audios, loader, *config, o, tokenCallback)
if err != nil {
return result, backend.TokenUsage{}, err
}

View File

@@ -135,7 +135,7 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
}
// Decode each request's message content
index := 0
imgIndex, vidIndex, audioIndex := 0, 0, 0
for i, m := range input.Messages {
switch content := m.Content.(type) {
case string:
@@ -144,20 +144,44 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
dat, _ := json.Marshal(content)
c := []schema.Content{}
json.Unmarshal(dat, &c)
CONTENT:
for _, pp := range c {
if pp.Type == "text" {
switch pp.Type {
case "text":
input.Messages[i].StringContent = pp.Text
} else if pp.Type == "image_url" {
// Detect if pp.ImageURL is an URL, if it is download the image and encode it in base64:
base64, err := utils.GetImageURLAsBase64(pp.ImageURL.URL)
if err == nil {
input.Messages[i].StringImages = append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff
// set a placeholder for each image
input.Messages[i].StringContent = fmt.Sprintf("[img-%d]", index) + input.Messages[i].StringContent
index++
} else {
log.Error().Msgf("Failed encoding image: %s", err)
case "video", "video_url":
// Decode the content as base64, whether it is a URL or inline base64 text
base64, err := utils.GetContentURIAsBase64(pp.VideoURL.URL)
if err != nil {
log.Error().Msgf("Failed encoding video: %s", err)
continue CONTENT
}
input.Messages[i].StringVideos = append(input.Messages[i].StringVideos, base64) // TODO: make sure that we only return base64 stuff
// set a placeholder for each video
input.Messages[i].StringContent = fmt.Sprintf("[vid-%d]", vidIndex) + input.Messages[i].StringContent
vidIndex++
case "audio_url", "audio":
// Decode the content as base64, whether it is a URL or inline base64 text
base64, err := utils.GetContentURIAsBase64(pp.AudioURL.URL)
if err != nil {
log.Error().Msgf("Failed encoding image: %s", err)
continue CONTENT
}
input.Messages[i].StringAudios = append(input.Messages[i].StringAudios, base64) // TODO: make sure that we only return base64 stuff
// set a placeholder for each audio
input.Messages[i].StringContent = fmt.Sprintf("[audio-%d]", audioIndex) + input.Messages[i].StringContent
audioIndex++
case "image_url", "image":
// Decode the content as base64, whether it is a URL or inline base64 text
base64, err := utils.GetContentURIAsBase64(pp.ImageURL.URL)
if err != nil {
log.Error().Msgf("Failed encoding image: %s", err)
continue CONTENT
}
input.Messages[i].StringImages = append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff
// set a placeholder for each image
input.Messages[i].StringContent = fmt.Sprintf("[img-%d]", imgIndex) + input.Messages[i].StringContent
imgIndex++
}
}
}
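On the wire, a chat message's content array can now mix text, image_url, video_url and audio_url parts; each successfully decoded part is prepended to the message text as an [img-N], [vid-N] or [audio-N] placeholder with its own counter. A sketch of a request body using the new part types (model name and data URIs are placeholders):

package main

import (
	"encoding/json"
	"fmt"
)

type contentURL struct {
	URL string `json:"url"`
}

type contentPart struct {
	Type     string      `json:"type"`
	Text     string      `json:"text,omitempty"`
	ImageURL *contentURL `json:"image_url,omitempty"`
	VideoURL *contentURL `json:"video_url,omitempty"`
	AudioURL *contentURL `json:"audio_url,omitempty"`
}

func main() {
	parts := []contentPart{
		{Type: "text", Text: "Describe what you see and hear."},
		{Type: "image_url", ImageURL: &contentURL{URL: "https://example.com/frame.png"}},
		{Type: "video_url", VideoURL: &contentURL{URL: "data:video/mp4;base64,AAAA"}},
		{Type: "audio_url", AudioURL: &contentURL{URL: "data:audio/wav;base64,UklG"}},
	}
	body, _ := json.MarshalIndent(map[string]any{
		"model":    "some-multimodal-model", // placeholder
		"messages": []map[string]any{{"role": "user", "content": parts}},
	}, "", "  ")
	fmt.Println(string(body))
}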

View File

@@ -0,0 +1,94 @@
package middleware
import (
"crypto/subtle"
"errors"
"github.com/dave-gray101/v2keyauth"
"github.com/gofiber/fiber/v2"
"github.com/gofiber/fiber/v2/middleware/keyauth"
"github.com/mudler/LocalAI/core/config"
)
// This file contains the configuration generators and handler functions that are used along with the fiber/keyauth middleware
// Currently this requires an upstream patch - and feature patches are no longer accepted to v2
// Therefore `dave-gray101/v2keyauth` contains the v2 backport of the middleware until v3 stabilizes and we migrate.
func GetKeyAuthConfig(applicationConfig *config.ApplicationConfig) (*v2keyauth.Config, error) {
customLookup, err := v2keyauth.MultipleKeySourceLookup([]string{"header:Authorization", "header:x-api-key", "header:xi-api-key"}, keyauth.ConfigDefault.AuthScheme)
if err != nil {
return nil, err
}
return &v2keyauth.Config{
CustomKeyLookup: customLookup,
Next: getApiKeyRequiredFilterFunction(applicationConfig),
Validator: getApiKeyValidationFunction(applicationConfig),
ErrorHandler: getApiKeyErrorHandler(applicationConfig),
AuthScheme: "Bearer",
}, nil
}
func getApiKeyErrorHandler(applicationConfig *config.ApplicationConfig) fiber.ErrorHandler {
return func(ctx *fiber.Ctx, err error) error {
if errors.Is(err, v2keyauth.ErrMissingOrMalformedAPIKey) {
if len(applicationConfig.ApiKeys) == 0 {
return ctx.Next() // if no keys are set up, any error we get here is not an error.
}
if applicationConfig.OpaqueErrors {
return ctx.SendStatus(403)
}
return ctx.Status(403).SendString(err.Error())
}
if applicationConfig.OpaqueErrors {
return ctx.SendStatus(500)
}
return err
}
}
func getApiKeyValidationFunction(applicationConfig *config.ApplicationConfig) func(*fiber.Ctx, string) (bool, error) {
if applicationConfig.UseSubtleKeyComparison {
return func(ctx *fiber.Ctx, apiKey string) (bool, error) {
if len(applicationConfig.ApiKeys) == 0 {
return true, nil // If no keys are setup, accept everything
}
for _, validKey := range applicationConfig.ApiKeys {
if subtle.ConstantTimeCompare([]byte(apiKey), []byte(validKey)) == 1 {
return true, nil
}
}
return false, v2keyauth.ErrMissingOrMalformedAPIKey
}
}
return func(ctx *fiber.Ctx, apiKey string) (bool, error) {
if len(applicationConfig.ApiKeys) == 0 {
return true, nil // If no keys are setup, accept everything
}
for _, validKey := range applicationConfig.ApiKeys {
if apiKey == validKey {
return true, nil
}
}
return false, v2keyauth.ErrMissingOrMalformedAPIKey
}
}
func getApiKeyRequiredFilterFunction(applicationConfig *config.ApplicationConfig) func(*fiber.Ctx) bool {
if applicationConfig.DisableApiKeyRequirementForHttpGet {
return func(c *fiber.Ctx) bool {
if c.Method() != "GET" {
return false
}
for _, rx := range applicationConfig.HttpGetExemptedEndpoints {
if rx.MatchString(c.Path()) {
return true
}
}
return false
}
}
return func(c *fiber.Ctx) bool { return false }
}
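The UseSubtleKeyComparison path trades a few cycles per request for constant-time behaviour: subtle.ConstantTimeCompare inspects every byte regardless of where the first mismatch occurs, so response timing does not reveal how long a correct prefix of a guessed key is. A minimal illustration:

package main

import (
	"crypto/subtle"
	"fmt"
)

func main() {
	valid := []byte("joshua")
	guess := []byte("joshux")

	// A plain comparison may stop at the first differing byte.
	fmt.Println("equal:", string(valid) == string(guess)) // false

	// ConstantTimeCompare returns 1 only on a full match and always scans
	// both slices, so its timing is independent of the mismatch position.
	fmt.Println("constant-time match:", subtle.ConstantTimeCompare(valid, guess) == 1) // false
}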

View File

@@ -10,12 +10,11 @@ import (
func RegisterElevenLabsRoutes(app *fiber.App,
cl *config.BackendConfigLoader,
ml *model.ModelLoader,
appConfig *config.ApplicationConfig,
auth func(*fiber.Ctx) error) {
appConfig *config.ApplicationConfig) {
// Elevenlabs
app.Post("/v1/text-to-speech/:voice-id", auth, elevenlabs.TTSEndpoint(cl, ml, appConfig))
app.Post("/v1/text-to-speech/:voice-id", elevenlabs.TTSEndpoint(cl, ml, appConfig))
app.Post("/v1/sound-generation", auth, elevenlabs.SoundGenerationEndpoint(cl, ml, appConfig))
app.Post("/v1/sound-generation", elevenlabs.SoundGenerationEndpoint(cl, ml, appConfig))
}

View File

@@ -0,0 +1,13 @@
package routes
import "github.com/gofiber/fiber/v2"
func HealthRoutes(app *fiber.App) {
// Service health checks
ok := func(c *fiber.Ctx) error {
return c.SendStatus(200)
}
app.Get("/healthz", ok)
app.Get("/readyz", ok)
}
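Because these routes are registered before the key-auth middleware is installed, liveness and readiness probes never need credentials; a probe can stay as simple as this (server address is a placeholder):

package main

import (
	"fmt"
	"net/http"
)

func main() {
	for _, path := range []string{"/healthz", "/readyz"} {
		resp, err := http.Get("http://127.0.0.1:8080" + path)
		if err != nil {
			fmt.Println(path, "unreachable:", err)
			continue
		}
		resp.Body.Close()
		fmt.Println(path, "->", resp.StatusCode) // expect 200 even with API keys set
	}
}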

View File

@@ -11,8 +11,7 @@ import (
func RegisterJINARoutes(app *fiber.App,
cl *config.BackendConfigLoader,
ml *model.ModelLoader,
appConfig *config.ApplicationConfig,
auth func(*fiber.Ctx) error) {
appConfig *config.ApplicationConfig) {
// POST endpoint to mimic the reranking
app.Post("/v1/rerank", jina.JINARerankEndpoint(cl, ml, appConfig))

View File

@@ -15,61 +15,52 @@ func RegisterLocalAIRoutes(app *fiber.App,
cl *config.BackendConfigLoader,
ml *model.ModelLoader,
appConfig *config.ApplicationConfig,
galleryService *services.GalleryService,
auth func(*fiber.Ctx) error) {
galleryService *services.GalleryService) {
app.Get("/swagger/*", swagger.HandlerDefault) // default
// LocalAI API endpoints
if !appConfig.DisableGalleryEndpoint {
modelGalleryEndpointService := localai.CreateModelGalleryEndpointService(appConfig.Galleries, appConfig.ModelPath, galleryService)
app.Post("/models/apply", auth, modelGalleryEndpointService.ApplyModelGalleryEndpoint())
app.Post("/models/delete/:name", auth, modelGalleryEndpointService.DeleteModelGalleryEndpoint())
app.Post("/models/apply", modelGalleryEndpointService.ApplyModelGalleryEndpoint())
app.Post("/models/delete/:name", modelGalleryEndpointService.DeleteModelGalleryEndpoint())
app.Get("/models/available", auth, modelGalleryEndpointService.ListModelFromGalleryEndpoint())
app.Get("/models/galleries", auth, modelGalleryEndpointService.ListModelGalleriesEndpoint())
app.Post("/models/galleries", auth, modelGalleryEndpointService.AddModelGalleryEndpoint())
app.Delete("/models/galleries", auth, modelGalleryEndpointService.RemoveModelGalleryEndpoint())
app.Get("/models/jobs/:uuid", auth, modelGalleryEndpointService.GetOpStatusEndpoint())
app.Get("/models/jobs", auth, modelGalleryEndpointService.GetAllStatusEndpoint())
app.Get("/models/available", modelGalleryEndpointService.ListModelFromGalleryEndpoint())
app.Get("/models/galleries", modelGalleryEndpointService.ListModelGalleriesEndpoint())
app.Post("/models/galleries", modelGalleryEndpointService.AddModelGalleryEndpoint())
app.Delete("/models/galleries", modelGalleryEndpointService.RemoveModelGalleryEndpoint())
app.Get("/models/jobs/:uuid", modelGalleryEndpointService.GetOpStatusEndpoint())
app.Get("/models/jobs", modelGalleryEndpointService.GetAllStatusEndpoint())
}
app.Post("/tts", auth, localai.TTSEndpoint(cl, ml, appConfig))
app.Post("/tts", localai.TTSEndpoint(cl, ml, appConfig))
// Stores
sl := model.NewModelLoader("")
app.Post("/stores/set", auth, localai.StoresSetEndpoint(sl, appConfig))
app.Post("/stores/delete", auth, localai.StoresDeleteEndpoint(sl, appConfig))
app.Post("/stores/get", auth, localai.StoresGetEndpoint(sl, appConfig))
app.Post("/stores/find", auth, localai.StoresFindEndpoint(sl, appConfig))
app.Post("/stores/set", localai.StoresSetEndpoint(sl, appConfig))
app.Post("/stores/delete", localai.StoresDeleteEndpoint(sl, appConfig))
app.Post("/stores/get", localai.StoresGetEndpoint(sl, appConfig))
app.Post("/stores/find", localai.StoresFindEndpoint(sl, appConfig))
// Kubernetes health checks
ok := func(c *fiber.Ctx) error {
return c.SendStatus(200)
}
app.Get("/healthz", ok)
app.Get("/readyz", ok)
app.Get("/metrics", auth, localai.LocalAIMetricsEndpoint())
app.Get("/metrics", localai.LocalAIMetricsEndpoint())
// Experimental Backend Statistics Module
backendMonitorService := services.NewBackendMonitorService(ml, cl, appConfig) // Split out for now
app.Get("/backend/monitor", auth, localai.BackendMonitorEndpoint(backendMonitorService))
app.Post("/backend/shutdown", auth, localai.BackendShutdownEndpoint(backendMonitorService))
app.Get("/backend/monitor", localai.BackendMonitorEndpoint(backendMonitorService))
app.Post("/backend/shutdown", localai.BackendShutdownEndpoint(backendMonitorService))
// p2p
if p2p.IsP2PEnabled() {
app.Get("/api/p2p", auth, localai.ShowP2PNodes(appConfig))
app.Get("/api/p2p/token", auth, localai.ShowP2PToken(appConfig))
app.Get("/api/p2p", localai.ShowP2PNodes(appConfig))
app.Get("/api/p2p/token", localai.ShowP2PToken(appConfig))
}
app.Get("/version", auth, func(c *fiber.Ctx) error {
app.Get("/version", func(c *fiber.Ctx) error {
return c.JSON(struct {
Version string `json:"version"`
}{Version: internal.PrintableVersion()})
})
app.Get("/system", auth, localai.SystemInformations(ml, appConfig))
app.Get("/system", localai.SystemInformations(ml, appConfig))
}

View File

@@ -11,66 +11,65 @@ import (
func RegisterOpenAIRoutes(app *fiber.App,
cl *config.BackendConfigLoader,
ml *model.ModelLoader,
appConfig *config.ApplicationConfig,
auth func(*fiber.Ctx) error) {
appConfig *config.ApplicationConfig) {
// openAI compatible API endpoint
// chat
app.Post("/v1/chat/completions", auth, openai.ChatEndpoint(cl, ml, appConfig))
app.Post("/chat/completions", auth, openai.ChatEndpoint(cl, ml, appConfig))
app.Post("/v1/chat/completions", openai.ChatEndpoint(cl, ml, appConfig))
app.Post("/chat/completions", openai.ChatEndpoint(cl, ml, appConfig))
// edit
app.Post("/v1/edits", auth, openai.EditEndpoint(cl, ml, appConfig))
app.Post("/edits", auth, openai.EditEndpoint(cl, ml, appConfig))
app.Post("/v1/edits", openai.EditEndpoint(cl, ml, appConfig))
app.Post("/edits", openai.EditEndpoint(cl, ml, appConfig))
// assistant
app.Get("/v1/assistants", auth, openai.ListAssistantsEndpoint(cl, ml, appConfig))
app.Get("/assistants", auth, openai.ListAssistantsEndpoint(cl, ml, appConfig))
app.Post("/v1/assistants", auth, openai.CreateAssistantEndpoint(cl, ml, appConfig))
app.Post("/assistants", auth, openai.CreateAssistantEndpoint(cl, ml, appConfig))
app.Delete("/v1/assistants/:assistant_id", auth, openai.DeleteAssistantEndpoint(cl, ml, appConfig))
app.Delete("/assistants/:assistant_id", auth, openai.DeleteAssistantEndpoint(cl, ml, appConfig))
app.Get("/v1/assistants/:assistant_id", auth, openai.GetAssistantEndpoint(cl, ml, appConfig))
app.Get("/assistants/:assistant_id", auth, openai.GetAssistantEndpoint(cl, ml, appConfig))
app.Post("/v1/assistants/:assistant_id", auth, openai.ModifyAssistantEndpoint(cl, ml, appConfig))
app.Post("/assistants/:assistant_id", auth, openai.ModifyAssistantEndpoint(cl, ml, appConfig))
app.Get("/v1/assistants/:assistant_id/files", auth, openai.ListAssistantFilesEndpoint(cl, ml, appConfig))
app.Get("/assistants/:assistant_id/files", auth, openai.ListAssistantFilesEndpoint(cl, ml, appConfig))
app.Post("/v1/assistants/:assistant_id/files", auth, openai.CreateAssistantFileEndpoint(cl, ml, appConfig))
app.Post("/assistants/:assistant_id/files", auth, openai.CreateAssistantFileEndpoint(cl, ml, appConfig))
app.Delete("/v1/assistants/:assistant_id/files/:file_id", auth, openai.DeleteAssistantFileEndpoint(cl, ml, appConfig))
app.Delete("/assistants/:assistant_id/files/:file_id", auth, openai.DeleteAssistantFileEndpoint(cl, ml, appConfig))
app.Get("/v1/assistants/:assistant_id/files/:file_id", auth, openai.GetAssistantFileEndpoint(cl, ml, appConfig))
app.Get("/assistants/:assistant_id/files/:file_id", auth, openai.GetAssistantFileEndpoint(cl, ml, appConfig))
app.Get("/v1/assistants", openai.ListAssistantsEndpoint(cl, ml, appConfig))
app.Get("/assistants", openai.ListAssistantsEndpoint(cl, ml, appConfig))
app.Post("/v1/assistants", openai.CreateAssistantEndpoint(cl, ml, appConfig))
app.Post("/assistants", openai.CreateAssistantEndpoint(cl, ml, appConfig))
app.Delete("/v1/assistants/:assistant_id", openai.DeleteAssistantEndpoint(cl, ml, appConfig))
app.Delete("/assistants/:assistant_id", openai.DeleteAssistantEndpoint(cl, ml, appConfig))
app.Get("/v1/assistants/:assistant_id", openai.GetAssistantEndpoint(cl, ml, appConfig))
app.Get("/assistants/:assistant_id", openai.GetAssistantEndpoint(cl, ml, appConfig))
app.Post("/v1/assistants/:assistant_id", openai.ModifyAssistantEndpoint(cl, ml, appConfig))
app.Post("/assistants/:assistant_id", openai.ModifyAssistantEndpoint(cl, ml, appConfig))
app.Get("/v1/assistants/:assistant_id/files", openai.ListAssistantFilesEndpoint(cl, ml, appConfig))
app.Get("/assistants/:assistant_id/files", openai.ListAssistantFilesEndpoint(cl, ml, appConfig))
app.Post("/v1/assistants/:assistant_id/files", openai.CreateAssistantFileEndpoint(cl, ml, appConfig))
app.Post("/assistants/:assistant_id/files", openai.CreateAssistantFileEndpoint(cl, ml, appConfig))
app.Delete("/v1/assistants/:assistant_id/files/:file_id", openai.DeleteAssistantFileEndpoint(cl, ml, appConfig))
app.Delete("/assistants/:assistant_id/files/:file_id", openai.DeleteAssistantFileEndpoint(cl, ml, appConfig))
app.Get("/v1/assistants/:assistant_id/files/:file_id", openai.GetAssistantFileEndpoint(cl, ml, appConfig))
app.Get("/assistants/:assistant_id/files/:file_id", openai.GetAssistantFileEndpoint(cl, ml, appConfig))
// files
app.Post("/v1/files", auth, openai.UploadFilesEndpoint(cl, appConfig))
app.Post("/files", auth, openai.UploadFilesEndpoint(cl, appConfig))
app.Get("/v1/files", auth, openai.ListFilesEndpoint(cl, appConfig))
app.Get("/files", auth, openai.ListFilesEndpoint(cl, appConfig))
app.Get("/v1/files/:file_id", auth, openai.GetFilesEndpoint(cl, appConfig))
app.Get("/files/:file_id", auth, openai.GetFilesEndpoint(cl, appConfig))
app.Delete("/v1/files/:file_id", auth, openai.DeleteFilesEndpoint(cl, appConfig))
app.Delete("/files/:file_id", auth, openai.DeleteFilesEndpoint(cl, appConfig))
app.Get("/v1/files/:file_id/content", auth, openai.GetFilesContentsEndpoint(cl, appConfig))
app.Get("/files/:file_id/content", auth, openai.GetFilesContentsEndpoint(cl, appConfig))
app.Post("/v1/files", openai.UploadFilesEndpoint(cl, appConfig))
app.Post("/files", openai.UploadFilesEndpoint(cl, appConfig))
app.Get("/v1/files", openai.ListFilesEndpoint(cl, appConfig))
app.Get("/files", openai.ListFilesEndpoint(cl, appConfig))
app.Get("/v1/files/:file_id", openai.GetFilesEndpoint(cl, appConfig))
app.Get("/files/:file_id", openai.GetFilesEndpoint(cl, appConfig))
app.Delete("/v1/files/:file_id", openai.DeleteFilesEndpoint(cl, appConfig))
app.Delete("/files/:file_id", openai.DeleteFilesEndpoint(cl, appConfig))
app.Get("/v1/files/:file_id/content", openai.GetFilesContentsEndpoint(cl, appConfig))
app.Get("/files/:file_id/content", openai.GetFilesContentsEndpoint(cl, appConfig))
// completion
app.Post("/v1/completions", auth, openai.CompletionEndpoint(cl, ml, appConfig))
app.Post("/completions", auth, openai.CompletionEndpoint(cl, ml, appConfig))
app.Post("/v1/engines/:model/completions", auth, openai.CompletionEndpoint(cl, ml, appConfig))
app.Post("/v1/completions", openai.CompletionEndpoint(cl, ml, appConfig))
app.Post("/completions", openai.CompletionEndpoint(cl, ml, appConfig))
app.Post("/v1/engines/:model/completions", openai.CompletionEndpoint(cl, ml, appConfig))
// embeddings
app.Post("/v1/embeddings", auth, openai.EmbeddingsEndpoint(cl, ml, appConfig))
app.Post("/embeddings", auth, openai.EmbeddingsEndpoint(cl, ml, appConfig))
app.Post("/v1/engines/:model/embeddings", auth, openai.EmbeddingsEndpoint(cl, ml, appConfig))
app.Post("/v1/embeddings", openai.EmbeddingsEndpoint(cl, ml, appConfig))
app.Post("/embeddings", openai.EmbeddingsEndpoint(cl, ml, appConfig))
app.Post("/v1/engines/:model/embeddings", openai.EmbeddingsEndpoint(cl, ml, appConfig))
// audio
app.Post("/v1/audio/transcriptions", auth, openai.TranscriptEndpoint(cl, ml, appConfig))
app.Post("/v1/audio/speech", auth, localai.TTSEndpoint(cl, ml, appConfig))
app.Post("/v1/audio/transcriptions", openai.TranscriptEndpoint(cl, ml, appConfig))
app.Post("/v1/audio/speech", localai.TTSEndpoint(cl, ml, appConfig))
// images
app.Post("/v1/images/generations", auth, openai.ImageEndpoint(cl, ml, appConfig))
app.Post("/v1/images/generations", openai.ImageEndpoint(cl, ml, appConfig))
if appConfig.ImageDir != "" {
app.Static("/generated-images", appConfig.ImageDir)
@@ -81,6 +80,6 @@ func RegisterOpenAIRoutes(app *fiber.App,
}
// List models
app.Get("/v1/models", auth, openai.ListModelsEndpoint(cl, ml))
app.Get("/models", auth, openai.ListModelsEndpoint(cl, ml))
app.Get("/v1/models", openai.ListModelsEndpoint(cl, ml))
app.Get("/models", openai.ListModelsEndpoint(cl, ml))
}

View File

@@ -59,8 +59,7 @@ func RegisterUIRoutes(app *fiber.App,
cl *config.BackendConfigLoader,
ml *model.ModelLoader,
appConfig *config.ApplicationConfig,
galleryService *services.GalleryService,
auth func(*fiber.Ctx) error) {
galleryService *services.GalleryService) {
// keeps the state of models that are being installed from the UI
var processingModels = NewModelOpCache()
@@ -85,10 +84,10 @@ func RegisterUIRoutes(app *fiber.App,
return processingModelsData, taskTypes
}
app.Get("/", auth, localai.WelcomeEndpoint(appConfig, cl, ml, modelStatus))
app.Get("/", localai.WelcomeEndpoint(appConfig, cl, ml, modelStatus))
if p2p.IsP2PEnabled() {
app.Get("/p2p", auth, func(c *fiber.Ctx) error {
app.Get("/p2p", func(c *fiber.Ctx) error {
summary := fiber.Map{
"Title": "LocalAI - P2P dashboard",
"Version": internal.PrintableVersion(),
@@ -104,17 +103,17 @@ func RegisterUIRoutes(app *fiber.App,
})
/* show nodes live! */
app.Get("/p2p/ui/workers", auth, func(c *fiber.Ctx) error {
app.Get("/p2p/ui/workers", func(c *fiber.Ctx) error {
return c.SendString(elements.P2PNodeBoxes(p2p.GetAvailableNodes(p2p.NetworkID(appConfig.P2PNetworkID, p2p.WorkerID))))
})
app.Get("/p2p/ui/workers-federation", auth, func(c *fiber.Ctx) error {
app.Get("/p2p/ui/workers-federation", func(c *fiber.Ctx) error {
return c.SendString(elements.P2PNodeBoxes(p2p.GetAvailableNodes(p2p.NetworkID(appConfig.P2PNetworkID, p2p.FederatedID))))
})
app.Get("/p2p/ui/workers-stats", auth, func(c *fiber.Ctx) error {
app.Get("/p2p/ui/workers-stats", func(c *fiber.Ctx) error {
return c.SendString(elements.P2PNodeStats(p2p.GetAvailableNodes(p2p.NetworkID(appConfig.P2PNetworkID, p2p.WorkerID))))
})
app.Get("/p2p/ui/workers-federation-stats", auth, func(c *fiber.Ctx) error {
app.Get("/p2p/ui/workers-federation-stats", func(c *fiber.Ctx) error {
return c.SendString(elements.P2PNodeStats(p2p.GetAvailableNodes(p2p.NetworkID(appConfig.P2PNetworkID, p2p.FederatedID))))
})
}
@@ -122,7 +121,7 @@ func RegisterUIRoutes(app *fiber.App,
if !appConfig.DisableGalleryEndpoint {
// Show the Models page (all models)
app.Get("/browse", auth, func(c *fiber.Ctx) error {
app.Get("/browse", func(c *fiber.Ctx) error {
term := c.Query("term")
models, _ := gallery.AvailableGalleryModels(appConfig.Galleries, appConfig.ModelPath)
@@ -167,7 +166,7 @@ func RegisterUIRoutes(app *fiber.App,
// Show the models, filtered from the user input
// https://htmx.org/examples/active-search/
app.Post("/browse/search/models", auth, func(c *fiber.Ctx) error {
app.Post("/browse/search/models", func(c *fiber.Ctx) error {
form := struct {
Search string `form:"search"`
}{}
@@ -188,7 +187,7 @@ func RegisterUIRoutes(app *fiber.App,
// This route is used when the "Install" button is pressed, we submit here a new job to the gallery service
// https://htmx.org/examples/progress-bar/
app.Post("/browse/install/model/:id", auth, func(c *fiber.Ctx) error {
app.Post("/browse/install/model/:id", func(c *fiber.Ctx) error {
galleryID := strings.Clone(c.Params("id")) // note: strings.Clone is required for multiple requests!
log.Debug().Msgf("UI job submitted to install : %+v\n", galleryID)
@@ -215,7 +214,7 @@ func RegisterUIRoutes(app *fiber.App,
// This route is used when the "Install" button is pressed, we submit here a new job to the gallery service
// https://htmx.org/examples/progress-bar/
app.Post("/browse/delete/model/:id", auth, func(c *fiber.Ctx) error {
app.Post("/browse/delete/model/:id", func(c *fiber.Ctx) error {
galleryID := strings.Clone(c.Params("id")) // note: strings.Clone is required for multiple requests!
log.Debug().Msgf("UI job submitted to delete : %+v\n", galleryID)
var galleryName = galleryID
@@ -255,7 +254,7 @@ func RegisterUIRoutes(app *fiber.App,
// Display the job current progress status
// If the job is done, we trigger the /browse/job/:uid route
// https://htmx.org/examples/progress-bar/
app.Get("/browse/job/progress/:uid", auth, func(c *fiber.Ctx) error {
app.Get("/browse/job/progress/:uid", func(c *fiber.Ctx) error {
jobUID := strings.Clone(c.Params("uid")) // note: strings.Clone is required for multiple requests!
status := galleryService.GetStatus(jobUID)
@@ -279,7 +278,7 @@ func RegisterUIRoutes(app *fiber.App,
// this route is hit when the job is done, and we display the
// final state (for now just displays "Installation completed")
app.Get("/browse/job/:uid", auth, func(c *fiber.Ctx) error {
app.Get("/browse/job/:uid", func(c *fiber.Ctx) error {
jobUID := strings.Clone(c.Params("uid")) // note: strings.Clone is required for multiple requests!
status := galleryService.GetStatus(jobUID)
@@ -303,7 +302,7 @@ func RegisterUIRoutes(app *fiber.App,
}
// Show the Chat page
app.Get("/chat/:model", auth, func(c *fiber.Ctx) error {
app.Get("/chat/:model", func(c *fiber.Ctx) error {
backendConfigs, _ := services.ListModels(cl, ml, "", true)
summary := fiber.Map{
@@ -318,7 +317,7 @@ func RegisterUIRoutes(app *fiber.App,
return c.Render("views/chat", summary)
})
app.Get("/talk/", auth, func(c *fiber.Ctx) error {
app.Get("/talk/", func(c *fiber.Ctx) error {
backendConfigs, _ := services.ListModels(cl, ml, "", true)
if len(backendConfigs) == 0 {
@@ -338,7 +337,7 @@ func RegisterUIRoutes(app *fiber.App,
return c.Render("views/talk", summary)
})
app.Get("/chat/", auth, func(c *fiber.Ctx) error {
app.Get("/chat/", func(c *fiber.Ctx) error {
backendConfigs, _ := services.ListModels(cl, ml, "", true)
@@ -359,7 +358,7 @@ func RegisterUIRoutes(app *fiber.App,
return c.Render("views/chat", summary)
})
app.Get("/text2image/:model", auth, func(c *fiber.Ctx) error {
app.Get("/text2image/:model", func(c *fiber.Ctx) error {
backendConfigs := cl.GetAllBackendConfigs()
summary := fiber.Map{
@@ -374,7 +373,7 @@ func RegisterUIRoutes(app *fiber.App,
return c.Render("views/text2image", summary)
})
app.Get("/text2image/", auth, func(c *fiber.Ctx) error {
app.Get("/text2image/", func(c *fiber.Ctx) error {
backendConfigs := cl.GetAllBackendConfigs()
@@ -395,7 +394,7 @@ func RegisterUIRoutes(app *fiber.App,
return c.Render("views/text2image", summary)
})
app.Get("/tts/:model", auth, func(c *fiber.Ctx) error {
app.Get("/tts/:model", func(c *fiber.Ctx) error {
backendConfigs := cl.GetAllBackendConfigs()
summary := fiber.Map{
@@ -410,7 +409,7 @@ func RegisterUIRoutes(app *fiber.App,
return c.Render("views/tts", summary)
})
app.Get("/tts/", auth, func(c *fiber.Ctx) error {
app.Get("/tts/", func(c *fiber.Ctx) error {
backendConfigs := cl.GetAllBackendConfigs()

View File

@@ -58,6 +58,8 @@ type Content struct {
Type string `json:"type" yaml:"type"`
Text string `json:"text" yaml:"text"`
ImageURL ContentURL `json:"image_url" yaml:"image_url"`
AudioURL ContentURL `json:"audio_url" yaml:"audio_url"`
VideoURL ContentURL `json:"video_url" yaml:"video_url"`
}
type ContentURL struct {
@@ -76,6 +78,8 @@ type Message struct {
StringContent string `json:"string_content,omitempty" yaml:"string_content,omitempty"`
StringImages []string `json:"string_images,omitempty" yaml:"string_images,omitempty"`
StringVideos []string `json:"string_videos,omitempty" yaml:"string_videos,omitempty"`
StringAudios []string `json:"string_audios,omitempty" yaml:"string_audios,omitempty"`
// A result of a function call
FunctionCall interface{} `json:"function_call,omitempty" yaml:"function_call,omitempty"`

View File

@@ -1,206 +1,243 @@
package startup
import (
"fmt"
"os"
"github.com/mudler/LocalAI/core"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/services"
"github.com/mudler/LocalAI/internal"
"github.com/mudler/LocalAI/pkg/assets"
"github.com/mudler/LocalAI/pkg/library"
"github.com/mudler/LocalAI/pkg/model"
pkgStartup "github.com/mudler/LocalAI/pkg/startup"
"github.com/mudler/LocalAI/pkg/xsysinfo"
"github.com/rs/zerolog/log"
)
func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.ModelLoader, *config.ApplicationConfig, error) {
options := config.NewApplicationConfig(opts...)
log.Info().Msgf("Starting LocalAI using %d threads, with models path: %s", options.Threads, options.ModelPath)
log.Info().Msgf("LocalAI version: %s", internal.PrintableVersion())
caps, err := xsysinfo.CPUCapabilities()
if err == nil {
log.Debug().Msgf("CPU capabilities: %v", caps)
}
gpus, err := xsysinfo.GPUs()
if err == nil {
log.Debug().Msgf("GPU count: %d", len(gpus))
for _, gpu := range gpus {
log.Debug().Msgf("GPU: %s", gpu.String())
}
}
// Make sure directories exists
if options.ModelPath == "" {
return nil, nil, nil, fmt.Errorf("options.ModelPath cannot be empty")
}
err = os.MkdirAll(options.ModelPath, 0750)
if err != nil {
return nil, nil, nil, fmt.Errorf("unable to create ModelPath: %q", err)
}
if options.ImageDir != "" {
err := os.MkdirAll(options.ImageDir, 0750)
if err != nil {
return nil, nil, nil, fmt.Errorf("unable to create ImageDir: %q", err)
}
}
if options.AudioDir != "" {
err := os.MkdirAll(options.AudioDir, 0750)
if err != nil {
return nil, nil, nil, fmt.Errorf("unable to create AudioDir: %q", err)
}
}
if options.UploadDir != "" {
err := os.MkdirAll(options.UploadDir, 0750)
if err != nil {
return nil, nil, nil, fmt.Errorf("unable to create UploadDir: %q", err)
}
}
if err := pkgStartup.InstallModels(options.Galleries, options.ModelLibraryURL, options.ModelPath, options.EnforcePredownloadScans, nil, options.ModelsURL...); err != nil {
log.Error().Err(err).Msg("error installing models")
}
cl := config.NewBackendConfigLoader(options.ModelPath)
ml := model.NewModelLoader(options.ModelPath)
configLoaderOpts := options.ToConfigLoaderOptions()
if err := cl.LoadBackendConfigsFromPath(options.ModelPath, configLoaderOpts...); err != nil {
log.Error().Err(err).Msg("error loading config files")
}
if options.ConfigFile != "" {
if err := cl.LoadMultipleBackendConfigsSingleFile(options.ConfigFile, configLoaderOpts...); err != nil {
log.Error().Err(err).Msg("error loading config file")
}
}
if err := cl.Preload(options.ModelPath); err != nil {
log.Error().Err(err).Msg("error downloading models")
}
if options.PreloadJSONModels != "" {
if err := services.ApplyGalleryFromString(options.ModelPath, options.PreloadJSONModels, options.EnforcePredownloadScans, options.Galleries); err != nil {
return nil, nil, nil, err
}
}
if options.PreloadModelsFromPath != "" {
if err := services.ApplyGalleryFromFile(options.ModelPath, options.PreloadModelsFromPath, options.EnforcePredownloadScans, options.Galleries); err != nil {
return nil, nil, nil, err
}
}
if options.Debug {
for _, v := range cl.GetAllBackendConfigs() {
log.Debug().Msgf("Model: %s (config: %+v)", v.Name, v)
}
}
if options.AssetsDestination != "" {
// Extract files from the embedded FS
err := assets.ExtractFiles(options.BackendAssets, options.AssetsDestination)
log.Debug().Msgf("Extracting backend assets files to %s", options.AssetsDestination)
if err != nil {
log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly)", err)
}
}
if options.LibPath != "" {
// If there is a lib directory, set LD_LIBRARY_PATH to include it
err := library.LoadExternal(options.LibPath)
if err != nil {
log.Error().Err(err).Str("LibPath", options.LibPath).Msg("Error while loading external libraries")
}
}
// turn off any process that was started by GRPC if the context is canceled
go func() {
<-options.Context.Done()
log.Debug().Msgf("Context canceled, shutting down")
err := ml.StopAllGRPC()
if err != nil {
log.Error().Err(err).Msg("error while stopping all grpc backends")
}
}()
if options.WatchDog {
wd := model.NewWatchDog(
ml,
options.WatchDogBusyTimeout,
options.WatchDogIdleTimeout,
options.WatchDogBusy,
options.WatchDogIdle)
ml.SetWatchDog(wd)
go wd.Run()
go func() {
<-options.Context.Done()
log.Debug().Msgf("Context canceled, shutting down")
wd.Shutdown()
}()
}
// Watch the configuration directory
startWatcher(options)
log.Info().Msg("core/startup process completed!")
return cl, ml, options, nil
}
func startWatcher(options *config.ApplicationConfig) {
if options.DynamicConfigsDir == "" {
// No need to start the watcher if the directory is not set
return
}
if _, err := os.Stat(options.DynamicConfigsDir); err != nil {
if os.IsNotExist(err) {
// We try to create the directory if it does not exist and was specified
if err := os.MkdirAll(options.DynamicConfigsDir, 0700); err != nil {
log.Error().Err(err).Msg("failed creating DynamicConfigsDir")
}
} else {
// something else happened, we log the error and don't start the watcher
log.Error().Err(err).Msg("failed to read DynamicConfigsDir, watcher will not be started")
return
}
}
configHandler := newConfigFileHandler(options)
if err := configHandler.Watch(); err != nil {
log.Error().Err(err).Msg("failed creating watcher")
}
}
// In Lieu of a proper DI framework, this function wires up the Application manually.
// This is in core/startup rather than core/state.go to keep package references clean!
func createApplication(appConfig *config.ApplicationConfig) *core.Application {
app := &core.Application{
ApplicationConfig: appConfig,
BackendConfigLoader: config.NewBackendConfigLoader(appConfig.ModelPath),
ModelLoader: model.NewModelLoader(appConfig.ModelPath),
}
var err error
// app.EmbeddingsBackendService = backend.NewEmbeddingsBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
// app.ImageGenerationBackendService = backend.NewImageGenerationBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
// app.LLMBackendService = backend.NewLLMBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
// app.TranscriptionBackendService = backend.NewTranscriptionBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
// app.TextToSpeechBackendService = backend.NewTextToSpeechBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
app.BackendMonitorService = services.NewBackendMonitorService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
app.GalleryService = services.NewGalleryService(app.ApplicationConfig)
// app.OpenAIService = services.NewOpenAIService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig, app.LLMBackendService)
app.LocalAIMetricsService, err = services.NewLocalAIMetricsService()
if err != nil {
log.Error().Err(err).Msg("encountered an error initializing metrics service, startup will continue but metrics will not be tracked.")
}
return app
}
package startup
import (
"fmt"
"os"
"github.com/mudler/LocalAI/core"
"github.com/mudler/LocalAI/core/backend"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/services"
"github.com/mudler/LocalAI/internal"
"github.com/mudler/LocalAI/pkg/assets"
"github.com/mudler/LocalAI/pkg/library"
"github.com/mudler/LocalAI/pkg/model"
pkgStartup "github.com/mudler/LocalAI/pkg/startup"
"github.com/mudler/LocalAI/pkg/xsysinfo"
"github.com/rs/zerolog/log"
)
func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.ModelLoader, *config.ApplicationConfig, error) {
options := config.NewApplicationConfig(opts...)
log.Info().Msgf("Starting LocalAI using %d threads, with models path: %s", options.Threads, options.ModelPath)
log.Info().Msgf("LocalAI version: %s", internal.PrintableVersion())
caps, err := xsysinfo.CPUCapabilities()
if err == nil {
log.Debug().Msgf("CPU capabilities: %v", caps)
}
gpus, err := xsysinfo.GPUs()
if err == nil {
log.Debug().Msgf("GPU count: %d", len(gpus))
for _, gpu := range gpus {
log.Debug().Msgf("GPU: %s", gpu.String())
}
}
// Make sure directories exists
if options.ModelPath == "" {
return nil, nil, nil, fmt.Errorf("options.ModelPath cannot be empty")
}
err = os.MkdirAll(options.ModelPath, 0750)
if err != nil {
return nil, nil, nil, fmt.Errorf("unable to create ModelPath: %q", err)
}
if options.ImageDir != "" {
err := os.MkdirAll(options.ImageDir, 0750)
if err != nil {
return nil, nil, nil, fmt.Errorf("unable to create ImageDir: %q", err)
}
}
if options.AudioDir != "" {
err := os.MkdirAll(options.AudioDir, 0750)
if err != nil {
return nil, nil, nil, fmt.Errorf("unable to create AudioDir: %q", err)
}
}
if options.UploadDir != "" {
err := os.MkdirAll(options.UploadDir, 0750)
if err != nil {
return nil, nil, nil, fmt.Errorf("unable to create UploadDir: %q", err)
}
}
if err := pkgStartup.InstallModels(options.Galleries, options.ModelLibraryURL, options.ModelPath, options.EnforcePredownloadScans, nil, options.ModelsURL...); err != nil {
log.Error().Err(err).Msg("error installing models")
}
cl := config.NewBackendConfigLoader(options.ModelPath)
ml := model.NewModelLoader(options.ModelPath)
configLoaderOpts := options.ToConfigLoaderOptions()
if err := cl.LoadBackendConfigsFromPath(options.ModelPath, configLoaderOpts...); err != nil {
log.Error().Err(err).Msg("error loading config files")
}
if options.ConfigFile != "" {
if err := cl.LoadMultipleBackendConfigsSingleFile(options.ConfigFile, configLoaderOpts...); err != nil {
log.Error().Err(err).Msg("error loading config file")
}
}
if err := cl.Preload(options.ModelPath); err != nil {
log.Error().Err(err).Msg("error downloading models")
}
if options.PreloadJSONModels != "" {
if err := services.ApplyGalleryFromString(options.ModelPath, options.PreloadJSONModels, options.EnforcePredownloadScans, options.Galleries); err != nil {
return nil, nil, nil, err
}
}
if options.PreloadModelsFromPath != "" {
if err := services.ApplyGalleryFromFile(options.ModelPath, options.PreloadModelsFromPath, options.EnforcePredownloadScans, options.Galleries); err != nil {
return nil, nil, nil, err
}
}
if options.Debug {
for _, v := range cl.GetAllBackendConfigs() {
log.Debug().Msgf("Model: %s (config: %+v)", v.Name, v)
}
}
if options.AssetsDestination != "" {
// Extract files from the embedded FS
err := assets.ExtractFiles(options.BackendAssets, options.AssetsDestination)
log.Debug().Msgf("Extracting backend assets files to %s", options.AssetsDestination)
if err != nil {
log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly)", err)
}
}
if options.LibPath != "" {
// If there is a lib directory, set LD_LIBRARY_PATH to include it
err := library.LoadExternal(options.LibPath)
if err != nil {
log.Error().Err(err).Str("LibPath", options.LibPath).Msg("Error while loading external libraries")
}
}
// turn off any process that was started by GRPC if the context is canceled
go func() {
<-options.Context.Done()
log.Debug().Msgf("Context canceled, shutting down")
err := ml.StopAllGRPC()
if err != nil {
log.Error().Err(err).Msg("error while stopping all grpc backends")
}
}()
if options.WatchDog {
wd := model.NewWatchDog(
ml,
options.WatchDogBusyTimeout,
options.WatchDogIdleTimeout,
options.WatchDogBusy,
options.WatchDogIdle)
ml.SetWatchDog(wd)
go wd.Run()
go func() {
<-options.Context.Done()
log.Debug().Msgf("Context canceled, shutting down")
wd.Shutdown()
}()
}
if options.LoadToMemory != nil {
for _, m := range options.LoadToMemory {
cfg, err := cl.LoadBackendConfigFileByName(m, options.ModelPath,
config.LoadOptionDebug(options.Debug),
config.LoadOptionThreads(options.Threads),
config.LoadOptionContextSize(options.ContextSize),
config.LoadOptionF16(options.F16),
config.ModelPath(options.ModelPath),
)
if err != nil {
return nil, nil, nil, err
}
log.Debug().Msgf("Auto loading model %s into memory from file: %s", m, cfg.Model)
grpcOpts := backend.GRPCModelOpts(*cfg)
o := []model.Option{
model.WithModel(cfg.Model),
model.WithAssetDir(options.AssetsDestination),
model.WithThreads(uint32(options.Threads)),
model.WithLoadGRPCLoadModelOpts(grpcOpts),
}
var backendErr error
if cfg.Backend != "" {
o = append(o, model.WithBackendString(cfg.Backend))
_, backendErr = ml.BackendLoader(o...)
} else {
_, backendErr = ml.GreedyLoader(o...)
}
if backendErr != nil {
return nil, nil, nil, err
}
}
}
// Watch the configuration directory
startWatcher(options)
log.Info().Msg("core/startup process completed!")
return cl, ml, options, nil
}
func startWatcher(options *config.ApplicationConfig) {
if options.DynamicConfigsDir == "" {
// No need to start the watcher if the directory is not set
return
}
if _, err := os.Stat(options.DynamicConfigsDir); err != nil {
if os.IsNotExist(err) {
// We try to create the directory if it does not exist and was specified
if err := os.MkdirAll(options.DynamicConfigsDir, 0700); err != nil {
log.Error().Err(err).Msg("failed creating DynamicConfigsDir")
}
} else {
// something else happened, we log the error and don't start the watcher
log.Error().Err(err).Msg("failed to read DynamicConfigsDir, watcher will not be started")
return
}
}
configHandler := newConfigFileHandler(options)
if err := configHandler.Watch(); err != nil {
log.Error().Err(err).Msg("failed creating watcher")
}
}
// In Lieu of a proper DI framework, this function wires up the Application manually.
// This is in core/startup rather than core/state.go to keep package references clean!
func createApplication(appConfig *config.ApplicationConfig) *core.Application {
app := &core.Application{
ApplicationConfig: appConfig,
BackendConfigLoader: config.NewBackendConfigLoader(appConfig.ModelPath),
ModelLoader: model.NewModelLoader(appConfig.ModelPath),
}
var err error
// app.EmbeddingsBackendService = backend.NewEmbeddingsBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
// app.ImageGenerationBackendService = backend.NewImageGenerationBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
// app.LLMBackendService = backend.NewLLMBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
// app.TranscriptionBackendService = backend.NewTranscriptionBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
// app.TextToSpeechBackendService = backend.NewTextToSpeechBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
app.BackendMonitorService = services.NewBackendMonitorService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
app.GalleryService = services.NewGalleryService(app.ApplicationConfig)
// app.OpenAIService = services.NewOpenAIService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig, app.LLMBackendService)
app.LocalAIMetricsService, err = services.NewLocalAIMetricsService()
if err != nil {
log.Error().Err(err).Msg("encountered an error initializing metrics service, startup will continue but metrics will not be tracked.")
}
return app
}

View File

@@ -154,7 +154,7 @@ Images are available with and without python dependencies. Note that images with
Images with `core` in the tag are smaller and do not contain any python dependencies.
{{< tabs tabTotal="6" >}}
{{< tabs tabTotal="7" >}}
{{% tab tabName="Vanilla / CPU Images" %}}
| Description | Quay | Docker Hub |
@@ -227,6 +227,15 @@ Images with `core` in the tag are smaller and do not contain any python dependen
{{% /tab %}}
{{% tab tabName="Vulkan Images" %}}
| Description | Quay | Docker Hub |
| --- | --- |-------------------------------------------------------------|
| Latest images from the branch (development) | `quay.io/go-skynet/local-ai: master-vulkan-ffmpeg-core ` | `localai/localai: master-vulkan-ffmpeg-core ` |
| Latest tag | `quay.io/go-skynet/local-ai: latest-vulkan-ffmpeg-core ` | `localai/localai: latest-vulkan-ffmpeg-core` |
| Versioned image including FFMpeg, no python | `quay.io/go-skynet/local-ai:{{< version >}}-vulkan-fmpeg-core` | `localai/localai:{{< version >}}-vulkan-fmpeg-core` |
{{% /tab %}}
{{< /tabs >}}
## See Also

View File

@@ -1,3 +1,3 @@
{
"version": "v2.20.1"
"version": "v2.21.0"
}

View File

@@ -39,7 +39,7 @@ func init() {
func GetRemoteLibraryShorteners(url string, basePath string) (map[string]string, error) {
remoteLibrary := map[string]string{}
uri := downloader.URI(url)
err := uri.DownloadAndUnmarshal(basePath, func(_ string, i []byte) error {
err := uri.DownloadWithCallback(basePath, func(_ string, i []byte) error {
return yaml.Unmarshal(i, &remoteLibrary)
})
if err != nil {

View File

@@ -1,6 +1,6 @@
llama_index==0.11.7
llama_index==0.11.12
requests==2.32.3
weaviate_client==4.6.7
weaviate_client==4.8.1
transformers
torch
chainlit

View File

@@ -1,2 +1,2 @@
langchain==0.2.16
openai==1.44.0
langchain==0.3.0
openai==1.47.1

View File

@@ -1,4 +1,4 @@
langchain==0.2.16
openai==1.44.1
chromadb==0.5.5
llama-index==0.11.7
langchain==0.3.0
openai==1.47.1
chromadb==0.5.7
llama-index==0.11.12

View File

@@ -8,9 +8,9 @@ colorama==0.4.6
dataclasses-json==0.6.7
debugpy==1.8.2
frozenlist==1.4.1
greenlet==3.0.3
idna==3.8
langchain==0.2.16
greenlet==3.1.0
idna==3.10
langchain==0.3.0
langchain-community==0.2.16
marshmallow==3.22.0
marshmallow-enum==1.5.1
@@ -18,16 +18,16 @@ multidict==6.0.5
mypy-extensions==1.0.0
numexpr==2.10.1
numpy==2.1.1
openai==1.44.0
openai==1.45.1
openapi-schema-pydantic==1.2.4
packaging>=23.2
pydantic==2.8.2
pydantic==2.9.2
PyYAML==6.0.2
requests==2.32.3
SQLAlchemy==2.0.32
SQLAlchemy==2.0.35
tenacity==8.5.0
tqdm==4.66.5
typing-inspect==0.9.0
typing_extensions==4.12.2
urllib3==2.2.2
yarl==1.11.0
urllib3==2.2.3
yarl==1.11.1

View File

@@ -1,4 +1,166 @@
---
## Qwen2.5
- &qwen25
name: "qwen2.5-14b-instruct"
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
license: apache-2.0
description: |
Qwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters.
tags:
- llm
- gguf
- gpu
- qwen
- qwen2.5
- cpu
urls:
- https://huggingface.co/bartowski/Qwen2.5-14B-Instruct-GGUF
- https://huggingface.co/Qwen/Qwen2.5-7B-Instruct
overrides:
parameters:
model: Qwen2.5-14B-Instruct-Q4_K_M.gguf
files:
- filename: Qwen2.5-14B-Instruct-Q4_K_M.gguf
sha256: e47ad95dad6ff848b431053b375adb5d39321290ea2c638682577dafca87c008
uri: huggingface://bartowski/Qwen2.5-14B-Instruct-GGUF/Qwen2.5-14B-Instruct-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "qwen2.5-math-7b-instruct"
urls:
- https://huggingface.co/bartowski/Qwen2.5-Math-7B-Instruct-GGUF
- https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct
description: |
In August 2024, we released the first series of mathematical LLMs - Qwen2-Math - of our Qwen family. A month later, we have upgraded it and open-sourced Qwen2.5-Math series, including base models Qwen2.5-Math-1.5B/7B/72B, instruction-tuned models Qwen2.5-Math-1.5B/7B/72B-Instruct, and mathematical reward model Qwen2.5-Math-RM-72B.
Unlike Qwen2-Math series which only supports using Chain-of-Thught (CoT) to solve English math problems, Qwen2.5-Math series is expanded to support using both CoT and Tool-integrated Reasoning (TIR) to solve math problems in both Chinese and English. The Qwen2.5-Math series models have achieved significant performance improvements compared to the Qwen2-Math series models on the Chinese and English mathematics benchmarks with CoT.
The base models of Qwen2-Math are initialized with Qwen2-1.5B/7B/72B, and then pretrained on a meticulously designed Mathematics-specific Corpus. This corpus contains large-scale high-quality mathematical web texts, books, codes, exam questions, and mathematical pre-training data synthesized by Qwen2.
overrides:
parameters:
model: Qwen2.5-Math-7B-Instruct-Q4_K_M.gguf
files:
- filename: Qwen2.5-Math-7B-Instruct-Q4_K_M.gguf
sha256: 7e03cee8c65b9ebf9ca14ddb010aca27b6b18e6c70f2779e94e7451d9529c091
uri: huggingface://bartowski/Qwen2.5-Math-7B-Instruct-GGUF/Qwen2.5-Math-7B-Instruct-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "qwen2.5-14b_uncencored"
icon: https://huggingface.co/SicariusSicariiStuff/Phi-3.5-mini-instruct_Uncensored/resolve/main/Misc/Uncensored.png
urls:
- https://huggingface.co/SicariusSicariiStuff/Qwen2.5-14B_Uncencored
- https://huggingface.co/bartowski/Qwen2.5-14B_Uncencored-GGUF
description: |
Qwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters.
Uncensored qwen2.5
tags:
- llm
- gguf
- gpu
- qwen
- qwen2.5
- cpu
- uncensored
overrides:
parameters:
model: Qwen2.5-14B_Uncencored-Q4_K_M.gguf
files:
- filename: Qwen2.5-14B_Uncencored-Q4_K_M.gguf
sha256: 066b9341b67e0fd0956de3576a3b7988574a5b9a0028aef2b9c8edeadd6dbbd1
uri: huggingface://bartowski/Qwen2.5-14B_Uncencored-GGUF/Qwen2.5-14B_Uncencored-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "qwen2.5-coder-7b-instruct"
urls:
- https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct
- https://huggingface.co/bartowski/Qwen2.5-Coder-7B-Instruct-GGUF
description: |
Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen). For Qwen2.5-Coder, we release three base language models and instruction-tuned language models, 1.5, 7 and 32 (coming soon) billion parameters. Qwen2.5-Coder brings the following improvements upon CodeQwen1.5:
Significantly improvements in code generation, code reasoning and code fixing. Base on the strong Qwen2.5, we scale up the training tokens into 5.5 trillion including source code, text-code grounding, Synthetic data, etc.
A more comprehensive foundation for real-world applications such as Code Agents. Not only enhancing coding capabilities but also maintaining its strengths in mathematics and general competencies.
Long-context Support up to 128K tokens.
overrides:
parameters:
model: Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf
files:
- filename: Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf
sha256: 1664fccab734674a50763490a8c6931b70e3f2f8ec10031b54806d30e5f956b6
uri: huggingface://bartowski/Qwen2.5-Coder-7B-Instruct-GGUF/Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "qwen2.5-math-72b-instruct"
icon: http://qianwen-res.oss-accelerate-overseas.aliyuncs.com/Qwen2.5/qwen2.5-math-pipeline.jpeg
urls:
- https://huggingface.co/Qwen/Qwen2.5-Math-72B-Instruct
- https://huggingface.co/bartowski/Qwen2.5-Math-72B-Instruct-GGUF
description: |
In August 2024, we released the first series of mathematical LLMs - Qwen2-Math - of our Qwen family. A month later, we have upgraded it and open-sourced Qwen2.5-Math series, including base models Qwen2.5-Math-1.5B/7B/72B, instruction-tuned models Qwen2.5-Math-1.5B/7B/72B-Instruct, and mathematical reward model Qwen2.5-Math-RM-72B.
Unlike Qwen2-Math series which only supports using Chain-of-Thught (CoT) to solve English math problems, Qwen2.5-Math series is expanded to support using both CoT and Tool-integrated Reasoning (TIR) to solve math problems in both Chinese and English. The Qwen2.5-Math series models have achieved significant performance improvements compared to the Qwen2-Math series models on the Chinese and English mathematics benchmarks with CoT
overrides:
parameters:
model: Qwen2.5-Math-72B-Instruct-Q4_K_M.gguf
files:
- filename: Qwen2.5-Math-72B-Instruct-Q4_K_M.gguf
sha256: 5dee8a6e21d555577712b4f65565a3c3737a0d5d92f5a82970728c6d8e237f17
uri: huggingface://bartowski/Qwen2.5-Math-72B-Instruct-GGUF/Qwen2.5-Math-72B-Instruct-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "qwen2.5-0.5b-instruct"
urls:
- https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct
- https://huggingface.co/bartowski/Qwen2.5-0.5B-Instruct-GGUF
overrides:
parameters:
model: Qwen2.5-0.5B-Instruct-Q4_K_M.gguf
files:
- filename: Qwen2.5-0.5B-Instruct-Q4_K_M.gguf
sha256: 6eb923e7d26e9cea28811e1a8e852009b21242fb157b26149d3b188f3a8c8653
uri: huggingface://bartowski/Qwen2.5-0.5B-Instruct-GGUF/Qwen2.5-0.5B-Instruct-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "qwen2.5-1.5b-instruct"
urls:
- https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct
- https://huggingface.co/bartowski/Qwen2.5-1.5B-Instruct-GGUF
overrides:
parameters:
model: Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
files:
- filename: Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
sha256: 1adf0b11065d8ad2e8123ea110d1ec956dab4ab038eab665614adba04b6c3370
uri: huggingface://bartowski/Qwen2.5-1.5B-Instruct-GGUF/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "qwen2.5-32b"
urls:
- https://huggingface.co/Qwen/Qwen2.5-32B
- https://huggingface.co/mradermacher/Qwen2.5-32B-GGUF
overrides:
parameters:
model: Qwen2.5-32B.Q4_K_M.gguf
files:
- filename: Qwen2.5-32B.Q4_K_M.gguf
sha256: 02703e27c8b964db445444581a6937ad7538f0c32a100b26b49fa0e8ff527155
uri: huggingface://mradermacher/Qwen2.5-32B-GGUF/Qwen2.5-32B.Q4_K_M.gguf
- !!merge <<: *qwen25
name: "qwen2.5-32b-instruct"
urls:
- https://huggingface.co/Qwen/Qwen2.5-32B-Instruct
- https://huggingface.co/bartowski/Qwen2.5-32B-Instruct-GGUF
overrides:
parameters:
model: Qwen2.5-32B-Instruct-Q4_K_M.gguf
files:
- filename: Qwen2.5-32B-Instruct-Q4_K_M.gguf
sha256: 2e5f6daea180dbc59f65a40641e94d3973b5dbaa32b3c0acf54647fa874e519e
uri: huggingface://bartowski/Qwen2.5-32B-Instruct-GGUF/Qwen2.5-32B-Instruct-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "qwen2.5-72b-instruct"
urls:
- https://huggingface.co/Qwen/Qwen2.5-72B-Instruct
- https://huggingface.co/bartowski/Qwen2.5-72B-Instruct-GGUF
overrides:
parameters:
model: Qwen2.5-72B-Instruct-Q4_K_M.gguf
files:
- filename: Qwen2.5-72B-Instruct-Q4_K_M.gguf
sha256: e4c8fad16946be8cf0bbf67eb8f4e18fc7415a5a6d2854b4cda453edb4082545
uri: huggingface://bartowski/Qwen2.5-72B-Instruct-GGUF/Qwen2.5-72B-Instruct-Q4_K_M.gguf
## SmolLM
- &smollm
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
@@ -439,6 +601,75 @@
- filename: Reflection-Llama-3.1-70B-q4_k_m.gguf
sha256: 16064e07037883a750cfeae9a7be41143aa857dbac81c2e93c68e2f941dee7b2
uri: huggingface://senseable/Reflection-Llama-3.1-70B-gguf/Reflection-Llama-3.1-70B-q4_k_m.gguf
- !!merge <<: *llama31
name: "llama-3.1-supernova-lite-reflection-v1.0-i1"
url: "github:mudler/LocalAI/gallery/llama3.1-reflective.yaml@master"
icon: https://i.ibb.co/r072p7j/eopi-ZVu-SQ0-G-Cav78-Byq-Tg.png
urls:
- https://huggingface.co/SE6446/Llama-3.1-SuperNova-Lite-Reflection-V1.0
- https://huggingface.co/mradermacher/Llama-3.1-SuperNova-Lite-Reflection-V1.0-i1-GGUF
description: |
This model is a LoRA adaptation of arcee-ai/Llama-3.1-SuperNova-Lite on thesven/Reflective-MAGLLAMA-v0.1.1. This has been a simple experiment into reflection and the model appears to perform adequately, though I am unsure if it is a large improvement.
overrides:
parameters:
model: Llama-3.1-SuperNova-Lite-Reflection-V1.0.i1-Q4_K_M.gguf
files:
- filename: Llama-3.1-SuperNova-Lite-Reflection-V1.0.i1-Q4_K_M.gguf
sha256: 0c4531fe553d00142808e1bc7348ae92d400794c5b64d2db1a974718324dfe9a
uri: huggingface://mradermacher/Llama-3.1-SuperNova-Lite-Reflection-V1.0-i1-GGUF/Llama-3.1-SuperNova-Lite-Reflection-V1.0.i1-Q4_K_M.gguf
- !!merge <<: *llama31
name: "llama-3.1-supernova-lite"
icon: https://i.ibb.co/r072p7j/eopi-ZVu-SQ0-G-Cav78-Byq-Tg.png
urls:
- https://huggingface.co/arcee-ai/Llama-3.1-SuperNova-Lite
- https://huggingface.co/arcee-ai/Llama-3.1-SuperNova-Lite-GGUF
description: |
Llama-3.1-SuperNova-Lite is an 8B parameter model developed by Arcee.ai, based on the Llama-3.1-8B-Instruct architecture. It is a distilled version of the larger Llama-3.1-405B-Instruct model, leveraging offline logits extracted from the 405B parameter variant. This 8B variation of Llama-3.1-SuperNova maintains high performance while offering exceptional instruction-following capabilities and domain-specific adaptability.
The model was trained using a state-of-the-art distillation pipeline and an instruction dataset generated with EvolKit, ensuring accuracy and efficiency across a wide range of tasks. For more information on its training, visit blog.arcee.ai.
Llama-3.1-SuperNova-Lite excels in both benchmark performance and real-world applications, providing the power of large-scale models in a more compact, efficient form ideal for organizations seeking high performance with reduced resource requirements.
overrides:
parameters:
model: supernova-lite-v1.Q4_K_M.gguf
files:
- filename: supernova-lite-v1.Q4_K_M.gguf
sha256: 237b7b0b704d294f92f36c576cc8fdc10592f95168a5ad0f075a2d8edf20da4d
uri: huggingface://arcee-ai/Llama-3.1-SuperNova-Lite-GGUF/supernova-lite-v1.Q4_K_M.gguf
- !!merge <<: *llama31
name: "llama3.1-8b-shiningvaliant2"
icon: https://cdn-uploads.huggingface.co/production/uploads/63444f2687964b331809eb55/EXX7TKbB-R6arxww2mk0R.jpeg
urls:
- https://huggingface.co/ValiantLabs/Llama3.1-8B-ShiningValiant2
- https://huggingface.co/bartowski/Llama3.1-8B-ShiningValiant2-GGUF
description: |
Shining Valiant 2 is a chat model built on Llama 3.1 8b, finetuned on our data for friendship, insight, knowledge and enthusiasm.
Finetuned on meta-llama/Meta-Llama-3.1-8B-Instruct for best available general performance
Trained on a variety of high quality data; focused on science, engineering, technical knowledge, and structured reasoning
overrides:
parameters:
model: Llama3.1-8B-ShiningValiant2-Q4_K_M.gguf
files:
- filename: Llama3.1-8B-ShiningValiant2-Q4_K_M.gguf
sha256: 9369eb97922a9f01e4eae610e3d7aaeca30762d78d9239884179451d60bdbdd2
uri: huggingface://bartowski/Llama3.1-8B-ShiningValiant2-GGUF/Llama3.1-8B-ShiningValiant2-Q4_K_M.gguf
- !!merge <<: *llama31
name: "nightygurps-14b-v1.1"
icon: https://cdn-uploads.huggingface.co/production/uploads/6336c5b3e3ac69e6a90581da/FvfjK7bKqsWdaBkB3eWgP.png
urls:
- https://huggingface.co/AlexBefest/NightyGurps-14b-v1.1
- https://huggingface.co/bartowski/NightyGurps-14b-v1.1-GGUF
description: |
This model works with Russian only.
This model is designed to run GURPS roleplaying games, as well as consult and assist. This model was trained on an augmented dataset of the GURPS Basic Set rulebook. Its primary purpose was initially to become an assistant consultant and assistant Game Master for the GURPS roleplaying system, but it can also be used as a GM for running solo games as a player.
overrides:
parameters:
model: NightyGurps-14b-v1.1-Q4_K_M.gguf
files:
- filename: NightyGurps-14b-v1.1-Q4_K_M.gguf
sha256: d09d53259ad2c0298150fa8c2db98fe42f11731af89fdc80ad0e255a19adc4b0
uri: huggingface://bartowski/NightyGurps-14b-v1.1-GGUF/NightyGurps-14b-v1.1-Q4_K_M.gguf
## Uncensored models
- !!merge <<: *llama31
name: "humanish-roleplay-llama-3.1-8b-i1"
@@ -658,6 +889,103 @@
- filename: Mahou-1.3-llama3.1-8B.Q4_K_M.gguf
sha256: 88bfdca2f6077d789d3e0f161d19711aa208a6d9a02cce96a2276c69413b3594
uri: huggingface://mradermacher/Mahou-1.3-llama3.1-8B-GGUF/Mahou-1.3-llama3.1-8B.Q4_K_M.gguf
- !!merge <<: *llama31
name: "azure_dusk-v0.2-iq-imatrix"
# chatml
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
icon: https://cdn-uploads.huggingface.co/production/uploads/65d4cf2693a0a3744a27536c/n3-g_YTk3FY-DBzxXd28E.png
urls:
- https://huggingface.co/Lewdiculous/Azure_Dusk-v0.2-GGUF-IQ-Imatrix
description: |
"Following up on Crimson_Dawn-v0.2 we have Azure_Dusk-v0.2! Training on Mistral-Nemo-Base-2407 this time I've added significantly more data, as well as trained using RSLoRA as opposed to regular LoRA. Another key change is training on ChatML as opposed to Mistral Formatting."
by Author.
overrides:
parameters:
model: Azure_Dusk-v0.2-Q4_K_M-imat.gguf
files:
- filename: Azure_Dusk-v0.2-Q4_K_M-imat.gguf
sha256: c03a670c00976d14c267a0322374ed488b2a5f4790eb509136ca4e75cbc10cf4
uri: huggingface://Lewdiculous/Azure_Dusk-v0.2-GGUF-IQ-Imatrix/Azure_Dusk-v0.2-Q4_K_M-imat.gguf
- !!merge <<: *llama31
name: "l3.1-8b-niitama-v1.1-iq-imatrix"
icon: https://cdn-uploads.huggingface.co/production/uploads/65d4cf2693a0a3744a27536c/2Q5ky8TvP0vLS1ulMXnrn.png
urls:
- https://huggingface.co/Sao10K/L3.1-8B-Niitama-v1.1
- https://huggingface.co/Lewdiculous/L3.1-8B-Niitama-v1.1-GGUF-IQ-Imatrix
description: |
GGUF-IQ-Imatrix quants for Sao10K/L3.1-8B-Niitama-v1.1
Here's the subjectively superior L3 version: L3-8B-Niitama-v1
An experimental model using experimental methods.
More detail on it:
Tamamo and Niitama are made from the same data. Literally. The only thing that's changed is how theyre shuffled and formatted. Yet, I get wildly different results.
Interesting, eh? Feels kinda not as good compared to the l3 version, but it's aight.
overrides:
parameters:
model: L3.1-8B-Niitama-v1.1-Q4_K_M-imat.gguf
files:
- filename: L3.1-8B-Niitama-v1.1-Q4_K_M-imat.gguf
sha256: 524163bd0f1d43c9284b09118abcc192f3250b13dd3bb79d60c28321108b6748
uri: huggingface://Lewdiculous/L3.1-8B-Niitama-v1.1-GGUF-IQ-Imatrix/L3.1-8B-Niitama-v1.1-Q4_K_M-imat.gguf
- !!merge <<: *llama31
name: "llama-3.1-8b-stheno-v3.4-iq-imatrix"
icon: https://huggingface.co/Sao10K/Llama-3.1-8B-Stheno-v3.4/resolve/main/meneno.jpg
urls:
- https://huggingface.co/Sao10K/Llama-3.1-8B-Stheno-v3.4
- https://huggingface.co/Lewdiculous/Llama-3.1-8B-Stheno-v3.4-GGUF-IQ-Imatrix
description: |
This model has went through a multi-stage finetuning process.
- 1st, over a multi-turn Conversational-Instruct
- 2nd, over a Creative Writing / Roleplay along with some Creative-based Instruct Datasets.
- - Dataset consists of a mixture of Human and Claude Data.
Prompting Format:
- Use the L3 Instruct Formatting - Euryale 2.1 Preset Works Well
- Temperature + min_p as per usual, I recommend 1.4 Temp + 0.2 min_p.
- Has a different vibe to previous versions. Tinker around.
Changes since previous Stheno Datasets:
- Included Multi-turn Conversation-based Instruct Datasets to boost multi-turn coherency. # This is a seperate set, not the ones made by Kalomaze and Nopm, that are used in Magnum. They're completely different data.
- Replaced Single-Turn Instruct with Better Prompts and Answers by Claude 3.5 Sonnet and Claude 3 Opus.
- Removed c2 Samples -> Underway of re-filtering and masking to use with custom prefills. TBD
- Included 55% more Roleplaying Examples based of [Gryphe's](https://huggingface.co/datasets/Gryphe/Sonnet3.5-Charcard-Roleplay) Charcard RP Sets. Further filtered and cleaned on.
- Included 40% More Creative Writing Examples.
- Included Datasets Targeting System Prompt Adherence.
- Included Datasets targeting Reasoning / Spatial Awareness.
- Filtered for the usual errors, slop and stuff at the end. Some may have slipped through, but I removed nearly all of it.
Personal Opinions:
- Llama3.1 was more disappointing, in the Instruct Tune? It felt overbaked, atleast. Likely due to the DPO being done after their SFT Stage.
- Tuning on L3.1 base did not give good results, unlike when I tested with Nemo base. unfortunate.
- Still though, I think I did an okay job. It does feel a bit more distinctive.
- It took a lot of tinkering, like a LOT to wrangle this.
overrides:
parameters:
model: Llama-3.1-8B-Stheno-v3.4-Q4_K_M-imat.gguf
files:
- filename: Llama-3.1-8B-Stheno-v3.4-Q4_K_M-imat.gguf
sha256: 830d4858aa11a654f82f69fa40dee819edf9ecf54213057648304eb84b8dd5eb
uri: huggingface://Lewdiculous/Llama-3.1-8B-Stheno-v3.4-GGUF-IQ-Imatrix/Llama-3.1-8B-Stheno-v3.4-Q4_K_M-imat.gguf
- !!merge <<: *llama31
name: "llama-3.1-8b-arliai-rpmax-v1.1"
urls:
- https://huggingface.co/ArliAI/Llama-3.1-8B-ArliAI-RPMax-v1.1
- https://huggingface.co/bartowski/Llama-3.1-8B-ArliAI-RPMax-v1.1-GGUF
description: |
RPMax is a series of models that are trained on a diverse set of curated creative writing and RP datasets with a focus on variety and deduplication. This model is designed to be highly creative and non-repetitive by making sure no two entries in the dataset have repeated characters or situations, which makes sure the model does not latch on to a certain personality and be capable of understanding and acting appropriately to any characters or situations.
overrides:
parameters:
model: Llama-3.1-8B-ArliAI-RPMax-v1.1-Q4_K_M.gguf
files:
- filename: Llama-3.1-8B-ArliAI-RPMax-v1.1-Q4_K_M.gguf
sha256: 0a601c7341228d9160332965298d799369a1dc2b7080771fb8051bdeb556b30c
uri: huggingface://bartowski/Llama-3.1-8B-ArliAI-RPMax-v1.1-GGUF/Llama-3.1-8B-ArliAI-RPMax-v1.1-Q4_K_M.gguf
- &deepseek
## Deepseek
url: "github:mudler/LocalAI/gallery/deepseek.yaml@master"
@@ -1195,6 +1523,38 @@
- filename: Pantheon-RP-1.6-12b-Nemo-Q4_K_M.gguf
sha256: cf3465c183bf4ecbccd1b6b480f687e0160475b04c87e2f1e5ebc8baa0f4c7aa
uri: huggingface://bartowski/Pantheon-RP-1.6-12b-Nemo-GGUF/Pantheon-RP-1.6-12b-Nemo-Q4_K_M.gguf
- !!merge <<: *mistral03
name: "acolyte-22b-i1"
icon: https://cdn-uploads.huggingface.co/production/uploads/6569a4ed2419be6072890cf8/3dcGMcrWK2-2vQh9QBt3o.png
urls:
- https://huggingface.co/rAIfle/Acolyte-22B
- https://huggingface.co/mradermacher/Acolyte-22B-i1-GGUF
description: |
LoRA of a bunch of random datasets on top of Mistral-Small-Instruct-2409, then SLERPed onto base at 0.5. Decent enough for its size. Check the LoRA for dataset info.
overrides:
parameters:
model: Acolyte-22B.i1-Q4_K_M.gguf
files:
- filename: Acolyte-22B.i1-Q4_K_M.gguf
sha256: 5a454405b98b6f886e8e4c695488d8ea098162bb8c46f2a7723fc2553c6e2f6e
uri: huggingface://mradermacher/Acolyte-22B-i1-GGUF/Acolyte-22B.i1-Q4_K_M.gguf
- !!merge <<: *mistral03
name: "mn-12b-lyra-v4-iq-imatrix"
icon: https://cdn-uploads.huggingface.co/production/uploads/65d4cf2693a0a3744a27536c/dVoru83WOpwVjMlgZ_xhA.png
# chatml
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
urls:
- https://huggingface.co/Lewdiculous/MN-12B-Lyra-v4-GGUF-IQ-Imatrix
description: |
A finetune of Mistral Nemo by Sao10K.
Uses the ChatML prompt format.
overrides:
parameters:
model: MN-12B-Lyra-v4-Q4_K_M-imat.gguf
files:
- filename: MN-12B-Lyra-v4-Q4_K_M-imat.gguf
sha256: 1989123481ca1936c8a2cbe278ff5d1d2b0ae63dbdc838bb36a6d7547b8087b3
uri: huggingface://Lewdiculous/MN-12B-Lyra-v4-GGUF-IQ-Imatrix/MN-12B-Lyra-v4-Q4_K_M-imat.gguf
- &mudler
### START mudler's LocalAI specific-models
url: "github:mudler/LocalAI/gallery/mudler.yaml@master"
@@ -1722,6 +2082,75 @@
- filename: Athena-codegemma-2-2b-it.Q4_K_M.gguf
sha256: 59ce17023438b0da603dd211c7d39f78e7acac4108258ac0818a97a4ca7d64e3
uri: huggingface://mradermacher/Athena-codegemma-2-2b-it-GGUF/Athena-codegemma-2-2b-it.Q4_K_M.gguf
- !!merge <<: *gemma
name: "datagemma-rag-27b-it"
urls:
- https://huggingface.co/google/datagemma-rag-27b-it
- https://huggingface.co/bartowski/datagemma-rag-27b-it-GGUF
description: |
DataGemma is a series of fine-tuned Gemma 2 models used to help LLMs access and incorporate reliable public statistical data from Data Commons into their responses. DataGemma RAG is used with Retrieval Augmented Generation, where it is trained to take a user query and generate natural language queries that can be understood by Data Commons' existing natural language interface. More information can be found in this research paper.
overrides:
parameters:
model: datagemma-rag-27b-it-Q4_K_M.gguf
files:
- filename: datagemma-rag-27b-it-Q4_K_M.gguf
sha256: 3dfcf51b05e3f0ab0979ad194de350edea71cb14444efa0a9f2ef5bfc80753f8
uri: huggingface://bartowski/datagemma-rag-27b-it-GGUF/datagemma-rag-27b-it-Q4_K_M.gguf
- !!merge <<: *gemma
name: "datagemma-rig-27b-it"
urls:
- https://huggingface.co/google/datagemma-rig-27b-it
- https://huggingface.co/bartowski/datagemma-rig-27b-it-GGUF
description: |
DataGemma is a series of fine-tuned Gemma 2 models used to help LLMs access and incorporate reliable public statistical data from Data Commons into their responses. DataGemma RIG is used in the retrieval interleaved generation approach (based off of tool-use approaches), where it is trained to annotate a response with natural language queries to Data Commons existing natural language interface wherever there are statistics. More information can be found in this research paper.
overrides:
parameters:
model: datagemma-rig-27b-it-Q4_K_M.gguf
files:
- filename: datagemma-rig-27b-it-Q4_K_M.gguf
sha256: a6738ffbb49b6c46d220e2793df85c0538e9ac72398e32a0914ee5e55c3096ad
uri: huggingface://bartowski/datagemma-rig-27b-it-GGUF/datagemma-rig-27b-it-Q4_K_M.gguf
- !!merge <<: *gemma
name: "buddy-2b-v1"
urls:
- https://huggingface.co/TheDrummer/Buddy-2B-v1
- https://huggingface.co/bartowski/Buddy-2B-v1-GGUF
description: |
Buddy is designed as an empathetic language model, aimed at fostering introspection, self-reflection, and personal growth through thoughtful conversation. Buddy won't judge and it won't dismiss your concerns. Get some self-care with Buddy.
overrides:
parameters:
model: Buddy-2B-v1-Q4_K_M.gguf
files:
- filename: Buddy-2B-v1-Q4_K_M.gguf
sha256: 9bd25ed907d1a3c2e07fe09399a9b3aec107d368c29896e2c46facede5b7e3d5
uri: huggingface://bartowski/Buddy-2B-v1-GGUF/Buddy-2B-v1-Q4_K_M.gguf
- !!merge <<: *gemma
name: "gemma-2-9b-arliai-rpmax-v1.1"
urls:
- https://huggingface.co/ArliAI/Gemma-2-9B-ArliAI-RPMax-v1.1
- https://huggingface.co/bartowski/Gemma-2-9B-ArliAI-RPMax-v1.1-GGUF
description: |
RPMax is a series of models that are trained on a diverse set of curated creative writing and RP datasets with a focus on variety and deduplication. This model is designed to be highly creative and non-repetitive by making sure no two entries in the dataset have repeated characters or situations, which makes sure the model does not latch on to a certain personality and be capable of understanding and acting appropriately to any characters or situations.
overrides:
parameters:
model: Gemma-2-9B-ArliAI-RPMax-v1.1-Q4_K_M.gguf
files:
- filename: Gemma-2-9B-ArliAI-RPMax-v1.1-Q4_K_M.gguf
sha256: 1724aff0ad6f71bf4371d839aca55578f7ec6f030d8d25c0254126088e4c6250
uri: huggingface://bartowski/Gemma-2-9B-ArliAI-RPMax-v1.1-GGUF/Gemma-2-9B-ArliAI-RPMax-v1.1-Q4_K_M.gguf
- !!merge <<: *gemma
name: "gemma-2-2b-arliai-rpmax-v1.1"
urls:
- https://huggingface.co/bartowski/Gemma-2-2B-ArliAI-RPMax-v1.1-GGUF
description: |
RPMax is a series of models that are trained on a diverse set of curated creative writing and RP datasets with a focus on variety and deduplication. This model is designed to be highly creative and non-repetitive by making sure no two entries in the dataset have repeated characters or situations, which makes sure the model does not latch on to a certain personality and be capable of understanding and acting appropriately to any characters or situations.
overrides:
parameters:
model: Gemma-2-2B-ArliAI-RPMax-v1.1-Q4_K_M.gguf
files:
- filename: Gemma-2-2B-ArliAI-RPMax-v1.1-Q4_K_M.gguf
sha256: 89fe35345754d7e9de8d0c0d5bf35b2be9b12a09811b365b712b8b27112f7712
uri: huggingface://bartowski/Gemma-2-2B-ArliAI-RPMax-v1.1-GGUF/Gemma-2-2B-ArliAI-RPMax-v1.1-Q4_K_M.gguf
- &llama3
url: "github:mudler/LocalAI/gallery/llama3-instruct.yaml@master"
icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/aJJxKus1wP5N-euvHEUq7.png
@@ -3601,7 +4030,7 @@
files:
- filename: NeuralDaredevil-8B-abliterated.Q4_K_M.gguf
sha256: 12f4af9d66817d7d300bd9a181e4fe66f7ecf7ea972049f2cbd0554cdc3ecf05
uri: huggingface://QuantFactory/NeuralDaredevil-8B-abliterated-GGUF/Poppy_Porpoise-0.85-L3-8B-Q4_K_M-imat.gguf
uri: huggingface://QuantFactory/NeuralDaredevil-8B-abliterated-GGUF/NeuralDaredevil-8B-abliterated.Q4_K_M.gguf
- !!merge <<: *llama3
name: "llama-3-8b-instruct-mopeymule"
urls:

View File

@@ -0,0 +1,65 @@
---
name: "llama3-instruct"
config_file: |
mmap: true
cutstrings:
- (.*?)</thinking>
function:
disable_no_action: true
grammar:
disable: true
response_regex:
- <function=(?P<name>\w+)>(?P<arguments>.*)</function>
template:
chat_message: |
<|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|>
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content -}}
{{ else if .FunctionCall -}}
{{ toJson .FunctionCall -}}
{{ end -}}
<|eot_id|>
function: |
<|start_header_id|>system<|end_header_id|>
You have access to the following functions:
{{range .Functions}}
Use the function '{{.Name}}' to '{{.Description}}'
{{toJson .Parameters}}
{{end}}
Think very carefully before calling functions.
If a you choose to call a function ONLY reply in the following format with no prefix or suffix:
<function=example_function_name>{{`{{"example_name": "example_value"}}`}}</function>
Reminder:
- If looking for real time information use relevant functions before falling back to searching on internet
- Function calls MUST follow the specified format, start with <function= and end with </function>
- Required parameters MUST be specified
- Only call one function at a time
- Put the entire function call reply on one line
<|eot_id|>
{{.Input }}
<|start_header_id|>assistant<|end_header_id|>
chat: |
{{.Input }}
<|start_header_id|>assistant<|end_header_id|>
<thinking>
completion: |
{{.Input}}
context_size: 8192
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
- "<|eot_id|>"
- <|end_of_text|>

5
go.mod
View File

@@ -1,8 +1,8 @@
module github.com/mudler/LocalAI
go 1.22.0
go 1.23
toolchain go1.22.4
toolchain go1.23.1
require (
dario.cat/mergo v1.0.0
@@ -74,6 +74,7 @@ require (
cloud.google.com/go/auth/oauth2adapt v0.2.2 // indirect
cloud.google.com/go/compute/metadata v0.3.0 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.4 // indirect
github.com/dave-gray101/v2keyauth v0.0.0-20240624150259-c45d584d25e2 // indirect
github.com/envoyproxy/protoc-gen-validate v1.0.4 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/go-task/slim-sprig/v3 v3.0.0 // indirect

2
go.sum
View File

@@ -110,6 +110,8 @@ github.com/creachadair/otp v0.4.2 h1:ngNMaD6Tzd7UUNRFyed7ykZFn/Wr5sSs5ffqZWm9pu8
github.com/creachadair/otp v0.4.2/go.mod h1:DqV9hJyUbcUme0pooYfiFvvMe72Aua5sfhNzwfZvk40=
github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY=
github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4=
github.com/dave-gray101/v2keyauth v0.0.0-20240624150259-c45d584d25e2 h1:flLYmnQFZNo04x2NPehMbf30m7Pli57xwZ0NFqR/hb0=
github.com/dave-gray101/v2keyauth v0.0.0-20240624150259-c45d584d25e2/go.mod h1:NtWqRzAp/1tw+twkW8uuBenEVVYndEAZACWU3F3xdoQ=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=

View File

@@ -31,7 +31,11 @@ const (
type URI string
func (uri URI) DownloadAndUnmarshal(basePath string, f func(url string, i []byte) error) error {
func (uri URI) DownloadWithCallback(basePath string, f func(url string, i []byte) error) error {
return uri.DownloadWithAuthorizationAndCallback(basePath, "", f)
}
func (uri URI) DownloadWithAuthorizationAndCallback(basePath string, authorization string, f func(url string, i []byte) error) error {
url := uri.ResolveURL()
if strings.HasPrefix(url, LocalPrefix) {
@@ -41,7 +45,6 @@ func (uri URI) DownloadAndUnmarshal(basePath string, f func(url string, i []byte
if err != nil {
return err
}
// ???
resolvedBasePath, err := filepath.EvalSymlinks(basePath)
if err != nil {
return err
@@ -63,7 +66,16 @@ func (uri URI) DownloadAndUnmarshal(basePath string, f func(url string, i []byte
}
// Send a GET request to the URL
response, err := http.Get(url)
req, err := http.NewRequest("GET", url, nil)
if err != nil {
return err
}
if authorization != "" {
req.Header.Add("Authorization", authorization)
}
response, err := http.DefaultClient.Do(req)
if err != nil {
return err
}

View File

@@ -11,7 +11,7 @@ var _ = Describe("Gallery API tests", func() {
It("parses github with a branch", func() {
uri := URI("github:go-skynet/model-gallery/gpt4all-j.yaml")
Expect(
uri.DownloadAndUnmarshal("", func(url string, i []byte) error {
uri.DownloadWithCallback("", func(url string, i []byte) error {
Expect(url).To(Equal("https://raw.githubusercontent.com/go-skynet/model-gallery/main/gpt4all-j.yaml"))
return nil
}),
@@ -21,7 +21,7 @@ var _ = Describe("Gallery API tests", func() {
uri := URI("github:go-skynet/model-gallery/gpt4all-j.yaml@main")
Expect(
uri.DownloadAndUnmarshal("", func(url string, i []byte) error {
uri.DownloadWithCallback("", func(url string, i []byte) error {
Expect(url).To(Equal("https://raw.githubusercontent.com/go-skynet/model-gallery/main/gpt4all-j.yaml"))
return nil
}),
@@ -30,7 +30,7 @@ var _ = Describe("Gallery API tests", func() {
It("parses github with urls", func() {
uri := URI("https://raw.githubusercontent.com/go-skynet/model-gallery/main/gpt4all-j.yaml")
Expect(
uri.DownloadAndUnmarshal("", func(url string, i []byte) error {
uri.DownloadWithCallback("", func(url string, i []byte) error {
Expect(url).To(Equal("https://raw.githubusercontent.com/go-skynet/model-gallery/main/gpt4all-j.yaml"))
return nil
}),

17
pkg/model/filters.go Normal file
View File

@@ -0,0 +1,17 @@
package model
import (
process "github.com/mudler/go-processmanager"
)
type GRPCProcessFilter = func(id string, p *process.Process) bool
func all(_ string, _ *process.Process) bool {
return true
}
func allExcept(s string) GRPCProcessFilter {
return func(id string, p *process.Process) bool {
return id != s
}
}

View File

@@ -320,7 +320,7 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
} else {
grpcProcess := backendPath(o.assetDir, backend)
if err := utils.VerifyPath(grpcProcess, o.assetDir); err != nil {
return nil, fmt.Errorf("grpc process not found in assetdir: %s", err.Error())
return nil, fmt.Errorf("refering to a backend not in asset dir: %s", err.Error())
}
if autoDetect {
@@ -332,7 +332,7 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
// Check if the file exists
if _, err := os.Stat(grpcProcess); os.IsNotExist(err) {
return nil, fmt.Errorf("grpc process not found: %s. some backends(stablediffusion, tts) require LocalAI compiled with GO_TAGS", grpcProcess)
return nil, fmt.Errorf("backend not found: %s", grpcProcess)
}
serverAddress, err := getFreeAddress()
@@ -355,6 +355,8 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
client = NewModel(serverAddress)
}
log.Debug().Msgf("Wait for the service to start up")
// Wait for the service to start up
ready := false
for i := 0; i < o.grpcAttempts; i++ {
@@ -413,10 +415,8 @@ func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err e
}
if o.singleActiveBackend {
ml.mu.Lock()
log.Debug().Msgf("Stopping all backends except '%s'", o.model)
err := ml.StopAllExcept(o.model)
ml.mu.Unlock()
err := ml.StopGRPC(allExcept(o.model))
if err != nil {
log.Error().Err(err).Str("keptModel", o.model).Msg("error while shutting down all backends except for the keptModel")
return nil, err
@@ -444,13 +444,10 @@ func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err e
func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {
o := NewOptions(opts...)
ml.mu.Lock()
// Return earlier if we have a model already loaded
// (avoid looping through all the backends)
if m := ml.CheckIsLoaded(o.model); m != nil {
log.Debug().Msgf("Model '%s' already loaded", o.model)
ml.mu.Unlock()
return m.GRPC(o.parallelRequests, ml.wd), nil
}
@@ -458,12 +455,11 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {
// If we can have only one backend active, kill all the others (except external backends)
if o.singleActiveBackend {
log.Debug().Msgf("Stopping all backends except '%s'", o.model)
err := ml.StopAllExcept(o.model)
err := ml.StopGRPC(allExcept(o.model))
if err != nil {
log.Error().Err(err).Str("keptModel", o.model).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing")
}
}
ml.mu.Unlock()
var err error

View File

@@ -69,6 +69,8 @@ var knownModelsNameSuffixToSkip []string = []string{
".tar.gz",
}
const retryTimeout = time.Duration(2 * time.Minute)
func (ml *ModelLoader) ListFilesInModelPath() ([]string, error) {
files, err := os.ReadDir(ml.ModelPath)
if err != nil {
@@ -116,9 +118,6 @@ func (ml *ModelLoader) ListModels() []*Model {
}
func (ml *ModelLoader) LoadModel(modelName string, loader func(string, string) (*Model, error)) (*Model, error) {
ml.mu.Lock()
defer ml.mu.Unlock()
// Check if we already have a loaded model
if model := ml.CheckIsLoaded(modelName); model != nil {
return model, nil
@@ -137,6 +136,8 @@ func (ml *ModelLoader) LoadModel(modelName string, loader func(string, string) (
return nil, fmt.Errorf("loader didn't return a model")
}
ml.mu.Lock()
defer ml.mu.Unlock()
ml.models[modelName] = model
return model, nil
@@ -146,18 +147,28 @@ func (ml *ModelLoader) ShutdownModel(modelName string) error {
ml.mu.Lock()
defer ml.mu.Unlock()
return ml.stopModel(modelName)
}
func (ml *ModelLoader) stopModel(modelName string) error {
defer ml.deleteProcess(modelName)
if _, ok := ml.models[modelName]; !ok {
_, ok := ml.models[modelName]
if !ok {
return fmt.Errorf("model %s not found", modelName)
}
return nil
retries := 1
for ml.models[modelName].GRPC(false, ml.wd).IsBusy() {
log.Debug().Msgf("%s busy. Waiting.", modelName)
dur := time.Duration(retries*2) * time.Second
if dur > retryTimeout {
dur = retryTimeout
}
time.Sleep(dur)
retries++
}
return ml.deleteProcess(modelName)
}
func (ml *ModelLoader) CheckIsLoaded(s string) *Model {
ml.mu.Lock()
defer ml.mu.Unlock()
m, ok := ml.models[s]
if !ok {
return nil

View File

@@ -9,27 +9,12 @@ import (
"strconv"
"strings"
"syscall"
"time"
"github.com/hpcloud/tail"
process "github.com/mudler/go-processmanager"
"github.com/rs/zerolog/log"
)
func (ml *ModelLoader) StopAllExcept(s string) error {
return ml.StopGRPC(func(id string, p *process.Process) bool {
if id != s {
for ml.models[id].GRPC(false, ml.wd).IsBusy() {
log.Debug().Msgf("%s busy. Waiting.", id)
time.Sleep(2 * time.Second)
}
log.Debug().Msgf("[single-backend] Stopping %s", id)
return true
}
return false
})
}
func (ml *ModelLoader) deleteProcess(s string) error {
if _, exists := ml.grpcProcesses[s]; exists {
if err := ml.grpcProcesses[s].Stop(); err != nil {
@@ -41,17 +26,11 @@ func (ml *ModelLoader) deleteProcess(s string) error {
return nil
}
type GRPCProcessFilter = func(id string, p *process.Process) bool
func includeAllProcesses(_ string, _ *process.Process) bool {
return true
}
func (ml *ModelLoader) StopGRPC(filter GRPCProcessFilter) error {
var err error = nil
for k, p := range ml.grpcProcesses {
if filter(k, p) {
e := ml.deleteProcess(k)
e := ml.ShutdownModel(k)
err = errors.Join(err, e)
}
}
@@ -59,10 +38,12 @@ func (ml *ModelLoader) StopGRPC(filter GRPCProcessFilter) error {
}
func (ml *ModelLoader) StopAllGRPC() error {
return ml.StopGRPC(includeAllProcesses)
return ml.StopGRPC(all)
}
func (ml *ModelLoader) GetGRPCPID(id string) (int, error) {
ml.mu.Lock()
defer ml.mu.Unlock()
p, exists := ml.grpcProcesses[id]
if !exists {
return -1, fmt.Errorf("no grpc backend found for %s", id)

View File

@@ -13,14 +13,8 @@ var base64DownloadClient http.Client = http.Client{
Timeout: 30 * time.Second,
}
// this function check if the string is an URL, if it's an URL downloads the image in memory
// encodes it in base64 and returns the base64 string
// This may look weird down in pkg/utils while it is currently only used in core/config
//
// but I believe it may be useful for MQTT as well in the near future, so I'm
// extracting it while I'm thinking of it.
func GetImageURLAsBase64(s string) (string, error) {
// GetContentURIAsBase64 checks if the string is an URL, if it's an URL downloads the content in memory encodes it in base64 and returns the base64 string, otherwise returns the string by stripping base64 data headers
func GetContentURIAsBase64(s string) (string, error) {
if strings.HasPrefix(s, "http") {
// download the image
resp, err := base64DownloadClient.Get(s)

View File

@@ -10,20 +10,20 @@ var _ = Describe("utils/base64 tests", func() {
It("GetImageURLAsBase64 can strip jpeg data url prefixes", func() {
// This one doesn't actually _care_ that it's base64, so feed "bad" data in this test in order to catch a change in that behavior for informational purposes.
input := "data:image/jpeg;base64,FOO"
b64, err := GetImageURLAsBase64(input)
b64, err := GetContentURIAsBase64(input)
Expect(err).To(BeNil())
Expect(b64).To(Equal("FOO"))
})
It("GetImageURLAsBase64 can strip png data url prefixes", func() {
// This one doesn't actually _care_ that it's base64, so feed "bad" data in this test in order to catch a change in that behavior for informational purposes.
input := "data:image/png;base64,BAR"
b64, err := GetImageURLAsBase64(input)
b64, err := GetContentURIAsBase64(input)
Expect(err).To(BeNil())
Expect(b64).To(Equal("BAR"))
})
It("GetImageURLAsBase64 returns an error for bogus data", func() {
input := "FOO"
b64, err := GetImageURLAsBase64(input)
b64, err := GetContentURIAsBase64(input)
Expect(b64).To(Equal(""))
Expect(err).ToNot(BeNil())
Expect(err).To(MatchError("not valid string"))
@@ -31,7 +31,7 @@ var _ = Describe("utils/base64 tests", func() {
It("GetImageURLAsBase64 can actually download images and calculates something", func() {
// This test doesn't actually _check_ the results at this time, which is bad, but there wasn't a test at all before...
input := "https://upload.wikimedia.org/wikipedia/en/2/29/Wargames.jpg"
b64, err := GetImageURLAsBase64(input)
b64, err := GetContentURIAsBase64(input)
Expect(err).To(BeNil())
Expect(b64).ToNot(BeNil())
})

View File

@@ -1394,6 +1394,12 @@ const docTemplate = `{
"description": "The message role",
"type": "string"
},
"string_audios": {
"type": "array",
"items": {
"type": "string"
}
},
"string_content": {
"type": "string"
},
@@ -1403,6 +1409,12 @@ const docTemplate = `{
"type": "string"
}
},
"string_videos": {
"type": "array",
"items": {
"type": "string"
}
},
"tool_calls": {
"type": "array",
"items": {

View File

@@ -1387,6 +1387,12 @@
"description": "The message role",
"type": "string"
},
"string_audios": {
"type": "array",
"items": {
"type": "string"
}
},
"string_content": {
"type": "string"
},
@@ -1396,6 +1402,12 @@
"type": "string"
}
},
"string_videos": {
"type": "array",
"items": {
"type": "string"
}
},
"tool_calls": {
"type": "array",
"items": {

View File

@@ -453,12 +453,20 @@ definitions:
role:
description: The message role
type: string
string_audios:
items:
type: string
type: array
string_content:
type: string
string_images:
items:
type: string
type: array
string_videos:
items:
type: string
type: array
tool_calls:
items:
$ref: '#/definitions/schema.ToolCall'

View File

@@ -171,7 +171,7 @@ var _ = Describe("E2E test", func() {
})
Context("vision", func() {
It("correctly", func() {
model := "gpt-4-vision-preview"
model := "gpt-4o"
resp, err := client.CreateChatCompletion(context.TODO(),
openai.ChatCompletionRequest{
Model: model, Messages: []openai.ChatCompletionMessage{