fix(go-grpc-server): always close resultChan

Previously the channel was never closed, so a client call to a server that does not implement PredictStream would hang indefinitely, waiting for resultChan to be consumed. Now, when the prediction stream returns, we close the channel and wait for the goroutine to finish. Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
feat(vllm): add support for image-to-text and video-to-text (#3729)
2026-05-24 08:38:02 -04:00 · 2024-10-05 00:07:58 +02:00 · 2024-10-04 23:42:05 +02:00 · 2024-10-04 19:52:43 +02:00 · 2024-10-04 18:32:29 +02:00 · 2024-10-04 09:10:07 +02:00
86 changed files with 1432 additions and 367 deletions
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -18,7 +18,7 @@ jobs:
        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
        if: ${{ github.actor != 'dependabot[bot]' }}
-        uses: securego/gosec@v2.21.0
+        uses: securego/gosec@v2.21.4
        with:
          # we let the report content trigger a failure using the GitHub Security features.
          args: '-no-fail -fmt sarif -out results.sarif ./...'
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -178,13 +178,22 @@ jobs:
        uses: actions/checkout@v4
        with:
          submodules: true
      - name: Dependencies
        run: |
          # Install protoc
          curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
          unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
          rm protoc.zip
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
          PATH="$PATH:$HOME/go/bin" make protogen-go
      - name: Build images
        run: |
          docker build --build-arg FFMPEG=true --build-arg IMAGE_TYPE=extras --build-arg EXTRA_BACKENDS=rerankers --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile .
          BASE_IMAGE=local-ai:tests DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
      - name: Test
        run: |
-          LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
+            PATH="$PATH:$HOME/go/bin" LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
            make run-e2e-aio
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -15,8 +15,6 @@ Thank you for your interest in contributing to LocalAI! We appreciate your time
 - [Documentation](#documentation)
 - [Community and Communication](#community-and-communication)
 ## Getting Started
 ### Prerequisites
@@ -54,7 +52,7 @@ If you find a bug, have a feature request, or encounter any issues, please check
 ## Coding Guidelines
- No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like []`golangci-lint`](https://golangci-lint.run) can help you here.
+- No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like [`golangci-lint`](https://golangci-lint.run) can help you here.
 ## Testing
@@ -84,5 +82,3 @@ We are welcome the contribution of the documents, please open new PR or create a
 - You can reach out via the Github issue tracker.
 - Open a new discussion at [Discussion](https://github.com/go-skynet/LocalAI/discussions)
 - Join the Discord channel [Discord](https://discord.gg/uJAeKSAGDy)
 ---
--- a/6
+++ b/6
@@ -8,7 +8,7 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=70392f1f81470607ba3afef04aa56c9f65587664
+CPPLLAMA_VERSION?=d5ed2b929d85bbd7dbeecb690880f07d9d7a6077
 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
@@ -16,7 +16,7 @@ RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
-WHISPER_CPP_VERSION?=0d2e2aed80109e8696791083bde3b58e190b7812
+WHISPER_CPP_VERSION?=ccc2547210e09e3a1785817383ab770389bb442b
 # bert.cpp version
 BERT_REPO?=https://github.com/go-skynet/go-bert.cpp
@@ -468,7 +468,7 @@ run-e2e-image:
 	ls -liah $(abspath ./tests/e2e-fixtures)
 	docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --gpus all --name e2e-tests-$(RANDOM) localai-tests
-run-e2e-aio:
+run-e2e-aio: protogen-go
 	@echo 'Running e2e AIO tests'
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e-aio
--- a/README.md
+++ b/README.md
@@ -68,9 +68,7 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
 [💻 Getting started](https://localai.io/basics/getting_started/index.html)
-## 🔥🔥 Hot topics / Roadmap
+## 📰 Latest project news
 [Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
 - Aug 2024:  🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io)
 - July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723
@@ -83,8 +81,12 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
 - May 2024: Chat, TTS, and Image generation in the WebUI: https://github.com/mudler/LocalAI/pull/2222
 - April 2024: Reranker API: https://github.com/mudler/LocalAI/pull/2121
-Hot topics (looking for contributors):
+Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
 ## 🔥🔥 Hot topics (looking for help):
 - Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
 - Realtime API: https://github.com/mudler/LocalAI/issues/3714
 - 🔥🔥 Distributed, P2P Global community pools: https://github.com/mudler/LocalAI/issues/3113
 - WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
 - Backends v2: https://github.com/mudler/LocalAI/issues/1126
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -26,6 +26,19 @@ service Backend {
  rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}
  rpc Rerank(RerankRequest) returns (RerankResult) {}
  rpc GetMetrics(MetricsRequest) returns (MetricsResponse);
 }
 // Define the empty request
 message MetricsRequest {}
 message MetricsResponse {
  int32 slot_id = 1;
  string prompt_json_for_slot = 2;  // Stores the prompt as a JSON string.
  float tokens_per_second = 3;
  int32 tokens_generated = 4;
  int32 prompt_tokens_processed = 5;
 }
 message RerankRequest {
@@ -136,6 +149,7 @@ message PredictOptions {
  repeated Message Messages = 44;
  repeated string Videos = 45;
  repeated string Audios = 46;
  string CorrelationId = 47;
 }
 // The response message containing the result
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -495,6 +495,16 @@ struct llama_server_context
        }
    }
    // Scans the slots in order and returns a pointer to the first one whose
    // is_processing() is true; returns nullptr when no slot is processing.
    llama_client_slot* get_active_slot() {
        llama_client_slot* active = nullptr;
        for (llama_client_slot& candidate : slots) {
            if (candidate.is_processing()) {
                // First match wins; stop scanning.
                active = &candidate;
                break;
            }
        }
        return active;
    }
    void initialize() {
        // create slots
        all_slots_are_idle = true;
@@ -2106,6 +2116,9 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
    data["ignore_eos"] = predict->ignoreeos();
    data["embeddings"] = predict->embeddings();
    // Add the correlationid to json data
    data["correlation_id"] = predict->correlationid();
    // for each image in the request, add the image data
    //
    for (int i = 0; i < predict->images_size(); i++) {
@@ -2344,6 +2357,11 @@ public:
                int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
                reply.set_prompt_tokens(tokens_evaluated);
                // Log Request Correlation Id
                LOG_VERBOSE("correlation:", {
                    { "id", data["correlation_id"] }
                });
                // Send the reply
                writer->Write(reply);
@@ -2367,6 +2385,12 @@ public:
        std::string completion_text;
        task_result result = llama.queue_results.recv(task_id);
        if (!result.error && result.stop) {
            // Log Request Correlation Id
            LOG_VERBOSE("correlation:", {
                { "id", data["correlation_id"] }
            });
            completion_text = result.result_json.value("content", "");
            int32_t tokens_predicted = result.result_json.value("tokens_predicted", 0);
            int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
@@ -2406,6 +2430,31 @@ public:
        return grpc::Status::OK;
    }
    // Fills `response` with generation metrics for the slot returned by
    // llama.get_active_slot(). When no slot is active every field is set to
    // its zero/empty value. Always returns grpc::Status::OK; `context` and
    // `request` are not read.
    grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
        llama_client_slot* active_slot = llama.get_active_slot();

        if (active_slot == nullptr) {
            // Handle case when no active slot exists
            response->set_slot_id(0);
            response->set_prompt_json_for_slot("");
            response->set_tokens_per_second(0);
            response->set_tokens_generated(0);
            response->set_prompt_tokens_processed(0);
            return grpc::Status::OK;
        }

        // Guard the division: with no recorded generation time the original
        // expression divides by zero and reports inf/NaN. Report 0 instead.
        // NOTE(review): the 1e3 factor suggests t_token_generation is in
        // milliseconds -- confirm against the slot definition.
        double tokens_per_second = 0.0;
        if (active_slot->t_token_generation > 0) {
            tokens_per_second = 1e3 / active_slot->t_token_generation * active_slot->n_decoded;
        }

        // Populate the response with metrics
        response->set_slot_id(active_slot->id);
        response->set_prompt_json_for_slot(active_slot->prompt.dump());
        response->set_tokens_per_second(tokens_per_second);
        response->set_tokens_generated(active_slot->n_decoded);
        response->set_prompt_tokens_processed(active_slot->num_prompt_tokens_processed);

        return grpc::Status::OK;
    }
 };
 void RunServer(const std::string& server_address) {
--- a/backend/python/autogptq/requirements.txt
+++ b/backend/python/autogptq/requirements.txt
@@ -1,6 +1,6 @@
 accelerate
 auto-gptq==0.7.1
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
 transformers
--- a/backend/python/bark/requirements.txt
+++ b/backend/python/bark/requirements.txt
@@ -1,4 +1,4 @@
 bark==0.1.5
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
--- a/backend/python/common/template/requirements.txt
+++ b/backend/python/common/template/requirements.txt
@@ -1,2 +1,2 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
--- a/backend/python/coqui/requirements.txt
+++ b/backend/python/coqui/requirements.txt
@@ -1,4 +1,4 @@
 coqui-tts
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
--- a/backend/python/diffusers/requirements.txt
+++ b/backend/python/diffusers/requirements.txt
@@ -1,5 +1,5 @@
 setuptools
-grpcio==1.66.1
+grpcio==1.66.2
 pillow
 protobuf
 certifi
--- a/backend/python/exllama2/requirements.txt
+++ b/backend/python/exllama2/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
 wheel
--- a/backend/python/mamba/requirements.txt
+++ b/backend/python/mamba/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
--- a/backend/python/openvoice/requirements-intel.txt
+++ b/backend/python/openvoice/requirements-intel.txt
@@ -2,7 +2,7 @@
 intel-extension-for-pytorch
 torch
 optimum[openvino]
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 librosa==0.9.1
 faster-whisper==1.0.3
@@ -18,6 +18,6 @@ python-dotenv
 pypinyin==0.50.0
 cn2an==0.5.22
 jieba==0.42.1
-gradio==4.38.1
+gradio==4.44.1
 langid==1.1.6
 git+https://github.com/myshell-ai/MeloTTS.git
--- a/backend/python/openvoice/requirements.txt
+++ b/backend/python/openvoice/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 librosa
 faster-whisper
--- a/backend/python/parler-tts/requirements.txt
+++ b/backend/python/parler-tts/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
 llvmlite==0.43.0
--- a/backend/python/rerankers/requirements.txt
+++ b/backend/python/rerankers/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
--- a/backend/python/sentencetransformers/requirements.txt
+++ b/backend/python/sentencetransformers/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
 datasets
--- a/backend/python/transformers-musicgen/requirements.txt
+++ b/backend/python/transformers-musicgen/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 scipy==1.14.0
 certifi
--- a/backend/python/transformers/requirements.txt
+++ b/backend/python/transformers/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
 setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/vall-e-x/requirements.txt
+++ b/backend/python/vall-e-x/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -5,6 +5,8 @@ import argparse
 import signal
 import sys
 import os
 from typing import List
 from PIL import Image
 import backend_pb2
 import backend_pb2_grpc
@@ -15,6 +17,8 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.sampling_params import SamplingParams
 from vllm.utils import random_uuid
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.multimodal.utils import fetch_image
 from vllm.assets.video import VideoAsset
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
@@ -105,6 +109,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        try:
            self.llm = AsyncLLMEngine.from_engine_args(engine_args)
        except Exception as err:
            print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr)
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        try:
@@ -117,7 +122,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
           )
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
-
+        print("Model loaded successfully", file=sys.stderr)
        return backend_pb2.Result(message="Model loaded successfully", success=True)
    async def Predict(self, request, context):
@@ -196,15 +201,33 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        if request.Seed != 0:
            sampling_params.seed = request.Seed
        # Extract image paths and process images
        prompt = request.Prompt
-        # If tokenizer template is enabled and messages are provided instead of prompt apply the tokenizer template
+        image_paths = request.Images
        image_data = [self.load_image(img_path) for img_path in image_paths]
        videos_path = request.Videos
        video_data = [self.load_video(video_path) for video_path in videos_path]
        # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template
        if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
            prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
-        # Generate text
+        # Generate text using the LLM engine
        request_id = random_uuid()
-        outputs = self.llm.generate(prompt, sampling_params, request_id)
+        print(f"Generating text with request_id: {request_id}", file=sys.stderr)
        outputs = self.llm.generate(
            {
                "prompt": prompt,
                "multi_modal_data": {
                    "image": image_data if image_data else None,
                    "video": video_data if video_data else None,
                } if image_data or video_data else None,
            },
            sampling_params=sampling_params,
            request_id=request_id,
        )
        # Stream the results
        generated_text = ""
@@ -227,9 +250,49 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        if streaming:
            return
        # Remove the image files from /tmp folder
        for img_path in image_paths:
            try:
                os.remove(img_path)
            except Exception as e:
                print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
        # Sending the final generated text
        yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
    def load_image(self, image_path: str):
        """
        Open the file at ``image_path`` as an image.

        If opening fails for any reason, the error is logged to stderr and
        the same path is handed to ``load_video`` as a fallback.

        Args:
            image_path (str): The path to the image file.

        Returns:
            The opened image, or whatever ``load_video`` returns for the path
            when the image could not be opened.
        """
        try:
            loaded = Image.open(image_path)
        except Exception as exc:
            print(f"Error loading image {image_path}: {exc}", file=sys.stderr)
            loaded = self.load_video(image_path)
        return loaded
    def load_video(self, video_path: str):
        """
        Load a video from the given file path.

        Args:
            video_path (str): The path to the video file.

        Returns:
            The ``np_ndarrays`` value of the ``VideoAsset`` built for
            ``video_path``, or ``None`` if loading fails (the error is
            logged to stderr).
        """
        try:
            video = VideoAsset(name=video_path).np_ndarrays
            return video
        except Exception as e:
            # Report the path that was actually requested; the previous code
            # referenced an undefined name here and raised NameError instead
            # of returning None.
            print(f"Error loading video {video_path}: {e}", file=sys.stderr)
            return None
 async def serve(address):
    # Start asyncio gRPC server
    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -13,4 +13,18 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi
-installRequirements
+if [ "x${BUILD_TYPE}" == "x" ]; then
        ensureVenv
        # https://docs.vllm.ai/en/v0.6.1/getting_started/cpu-installation.html
        if [ ! -d vllm ]; then
            git clone https://github.com/vllm-project/vllm
        fi
        pushd vllm
            uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.66.2 protobuf bitsandbytes
            uv pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
            VLLM_TARGET_DEVICE=cpu python setup.py install
        popd
        rm -rf vllm
    else
        installRequirements
 fi
--- a/backend/python/vllm/requirements-cublas11.txt
+++ b/backend/python/vllm/requirements-cublas11.txt
@@ -2,3 +2,4 @@
 accelerate
 torch
 transformers
 bitsandbytes
--- a/backend/python/vllm/requirements-cublas12.txt
+++ b/backend/python/vllm/requirements-cublas12.txt
@@ -1,3 +1,4 @@
 accelerate
 torch
 transformers
 bitsandbytes
--- a/backend/python/vllm/requirements-hipblas.txt
+++ b/backend/python/vllm/requirements-hipblas.txt
@@ -2,3 +2,4 @@
 accelerate
 torch
 transformers
 bitsandbytes
--- a/backend/python/vllm/requirements-intel.txt
+++ b/backend/python/vllm/requirements-intel.txt
@@ -5,3 +5,4 @@ torch
 transformers
 optimum[openvino]
 setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 bitsandbytes
--- a/backend/python/vllm/requirements.txt
+++ b/backend/python/vllm/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.66.2
 protobuf
 certifi
 setuptools
--- a/core/backend/embeddings.go
+++ b/core/backend/embeddings.go
@@ -10,20 +10,11 @@ import (
 )
 func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) {
 	modelFile := backendConfig.Model
 	grpcOpts := GRPCModelOpts(backendConfig)
 	var inferenceModel interface{}
 	var err error
-	opts := modelOpts(backendConfig, appConfig, []model.Option{
+	opts := ModelOptions(backendConfig, appConfig, []model.Option{})
 		model.WithLoadGRPCLoadModelOpts(grpcOpts),
 		model.WithThreads(uint32(*backendConfig.Threads)),
 		model.WithAssetDir(appConfig.AssetsDestination),
 		model.WithModel(modelFile),
 		model.WithContext(appConfig.Context),
 	})
 	if backendConfig.Backend == "" {
 		inferenceModel, err = loader.GreedyLoader(opts...)
--- a/core/backend/image.go
+++ b/core/backend/image.go
@@ -8,19 +8,8 @@ import (
 )
 func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) {
-	threads := backendConfig.Threads
+
-	if *threads == 0 && appConfig.Threads != 0 {
+	opts := ModelOptions(backendConfig, appConfig, []model.Option{})
 		threads = &appConfig.Threads
 	}
 	gRPCOpts := GRPCModelOpts(backendConfig)
 	opts := modelOpts(backendConfig, appConfig, []model.Option{
 		model.WithBackendString(backendConfig.Backend),
 		model.WithAssetDir(appConfig.AssetsDestination),
 		model.WithThreads(uint32(*threads)),
 		model.WithContext(appConfig.Context),
 		model.WithModel(backendConfig.Model),
 		model.WithLoadGRPCLoadModelOpts(gRPCOpts),
 	})
 	inferenceModel, err := loader.BackendLoader(
 		opts...,
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -33,22 +33,11 @@ type TokenUsage struct {
 func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos, audios []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
 	modelFile := c.Model
 	threads := c.Threads
 	if *threads == 0 && o.Threads != 0 {
 		threads = &o.Threads
 	}
 	grpcOpts := GRPCModelOpts(c)
 	var inferenceModel grpc.Backend
 	var err error
-	opts := modelOpts(c, o, []model.Option{
+	opts := ModelOptions(c, o, []model.Option{})
 		model.WithLoadGRPCLoadModelOpts(grpcOpts),
 		model.WithThreads(uint32(*threads)), // some models uses this to allocate threads during startup
 		model.WithAssetDir(o.AssetsDestination),
 		model.WithModel(modelFile),
 		model.WithContext(o.Context),
 	})
 	if c.Backend != "" {
 		opts = append(opts, model.WithBackendString(c.Backend))
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -11,32 +11,65 @@ import (
 	"github.com/rs/zerolog/log"
 )
-func modelOpts(c config.BackendConfig, so *config.ApplicationConfig, opts []model.Option) []model.Option {
+func ModelOptions(c config.BackendConfig, so *config.ApplicationConfig, opts []model.Option) []model.Option {
 	name := c.Name
 	if name == "" {
 		name = c.Model
 	}
 	defOpts := []model.Option{
 		model.WithBackendString(c.Backend),
 		model.WithModel(c.Model),
 		model.WithAssetDir(so.AssetsDestination),
 		model.WithContext(so.Context),
 		model.WithModelID(name),
 	}
 	threads := 1
 	if c.Threads != nil {
 		threads = *c.Threads
 	}
 	if so.Threads != 0 {
 		threads = so.Threads
 	}
 	c.Threads = &threads
 	grpcOpts := grpcModelOpts(c)
 	defOpts = append(defOpts, model.WithLoadGRPCLoadModelOpts(grpcOpts))
 	if so.SingleBackend {
-		opts = append(opts, model.WithSingleActiveBackend())
+		defOpts = append(defOpts, model.WithSingleActiveBackend())
 	}
 	if so.ParallelBackendRequests {
-		opts = append(opts, model.EnableParallelRequests)
+		defOpts = append(defOpts, model.EnableParallelRequests)
 	}
 	if c.GRPC.Attempts != 0 {
-		opts = append(opts, model.WithGRPCAttempts(c.GRPC.Attempts))
+		defOpts = append(defOpts, model.WithGRPCAttempts(c.GRPC.Attempts))
 	}
 	if c.GRPC.AttemptsSleepTime != 0 {
-		opts = append(opts, model.WithGRPCAttemptsDelay(c.GRPC.AttemptsSleepTime))
+		defOpts = append(defOpts, model.WithGRPCAttemptsDelay(c.GRPC.AttemptsSleepTime))
 	}
 	for k, v := range so.ExternalGRPCBackends {
-		opts = append(opts, model.WithExternalBackend(k, v))
+		defOpts = append(defOpts, model.WithExternalBackend(k, v))
 	}
-	return opts
+	return append(defOpts, opts...)
 }
 func getSeed(c config.BackendConfig) int32 {
-	seed := int32(*c.Seed)
+	var seed int32 = config.RAND_SEED
 	if c.Seed != nil {
 		seed = int32(*c.Seed)
 	}
 	if seed == config.RAND_SEED {
 		seed = rand.Int31()
 	}
@@ -44,11 +77,47 @@ func getSeed(c config.BackendConfig) int32 {
 	return seed
 }
-func GRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
+func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 	b := 512
 	if c.Batch != 0 {
 		b = c.Batch
 	}
 	f16 := false
 	if c.F16 != nil {
 		f16 = *c.F16
 	}
 	embeddings := false
 	if c.Embeddings != nil {
 		embeddings = *c.Embeddings
 	}
 	lowVRAM := false
 	if c.LowVRAM != nil {
 		lowVRAM = *c.LowVRAM
 	}
 	mmap := false
 	if c.MMap != nil {
 		mmap = *c.MMap
 	}
 	ctxSize := 1024
 	if c.ContextSize != nil {
 		ctxSize = *c.ContextSize
 	}
 	mmlock := false
 	if c.MMlock != nil {
 		mmlock = *c.MMlock
 	}
 	nGPULayers := 9999999
 	if c.NGPULayers != nil {
 		nGPULayers = *c.NGPULayers
 	}
 	return &pb.ModelOptions{
 		CUDA:                 c.CUDA || c.Diffusers.CUDA,
 		SchedulerType:        c.Diffusers.SchedulerType,
@@ -56,14 +125,14 @@ func GRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		CFGScale:             c.Diffusers.CFGScale,
 		LoraAdapter:          c.LoraAdapter,
 		LoraScale:            c.LoraScale,
-		F16Memory:            *c.F16,
+		F16Memory:            f16,
 		LoraBase:             c.LoraBase,
 		IMG2IMG:              c.Diffusers.IMG2IMG,
 		CLIPModel:            c.Diffusers.ClipModel,
 		CLIPSubfolder:        c.Diffusers.ClipSubFolder,
 		CLIPSkip:             int32(c.Diffusers.ClipSkip),
 		ControlNet:           c.Diffusers.ControlNet,
-		ContextSize:          int32(*c.ContextSize),
+		ContextSize:          int32(ctxSize),
 		Seed:                 getSeed(c),
 		NBatch:               int32(b),
 		NoMulMatQ:            c.NoMulMatQ,
@@ -85,16 +154,16 @@ func GRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		YarnBetaSlow:         c.YarnBetaSlow,
 		NGQA:                 c.NGQA,
 		RMSNormEps:           c.RMSNormEps,
-		MLock:                *c.MMlock,
+		MLock:                mmlock,
 		RopeFreqBase:         c.RopeFreqBase,
 		RopeScaling:          c.RopeScaling,
 		Type:                 c.ModelType,
 		RopeFreqScale:        c.RopeFreqScale,
 		NUMA:                 c.NUMA,
-		Embeddings:           *c.Embeddings,
+		Embeddings:           embeddings,
-		LowVRAM:              *c.LowVRAM,
+		LowVRAM:              lowVRAM,
-		NGPULayers:           int32(*c.NGPULayers),
+		NGPULayers:           int32(nGPULayers),
-		MMap:                 *c.MMap,
+		MMap:                 mmap,
 		MainGPU:              c.MainGPU,
 		Threads:              int32(*c.Threads),
 		TensorSplit:          c.TensorSplit,
--- a/core/backend/rerank.go
+++ b/core/backend/rerank.go
@@ -9,21 +9,9 @@ import (
 	model "github.com/mudler/LocalAI/pkg/model"
 )
-func Rerank(backend, modelFile string, request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
+func Rerank(modelFile string, request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
 	bb := backend
 	if bb == "" {
 		return nil, fmt.Errorf("backend is required")
 	}
-	grpcOpts := GRPCModelOpts(backendConfig)
+	opts := ModelOptions(backendConfig, appConfig, []model.Option{model.WithModel(modelFile)})
 	opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
 		model.WithBackendString(bb),
 		model.WithModel(modelFile),
 		model.WithContext(appConfig.Context),
 		model.WithAssetDir(appConfig.AssetsDestination),
 		model.WithLoadGRPCLoadModelOpts(grpcOpts),
 	})
 	rerankModel, err := loader.BackendLoader(opts...)
 	if err != nil {
 		return nil, err
--- a/core/backend/soundgeneration.go
+++ b/core/backend/soundgeneration.go
@@ -13,7 +13,6 @@ import (
 )
 func SoundGeneration(
 	backend string,
 	modelFile string,
 	text string,
 	duration *float32,
@@ -25,18 +24,8 @@ func SoundGeneration(
 	appConfig *config.ApplicationConfig,
 	backendConfig config.BackendConfig,
 ) (string, *proto.Result, error) {
 	if backend == "" {
 		return "", nil, fmt.Errorf("backend is a required parameter")
 	}
-	grpcOpts := GRPCModelOpts(backendConfig)
+	opts := ModelOptions(backendConfig, appConfig, []model.Option{model.WithModel(modelFile)})
 	opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
 		model.WithBackendString(backend),
 		model.WithModel(modelFile),
 		model.WithContext(appConfig.Context),
 		model.WithAssetDir(appConfig.AssetsDestination),
 		model.WithLoadGRPCLoadModelOpts(grpcOpts),
 	})
 	soundGenModel, err := loader.BackendLoader(opts...)
 	if err != nil {
--- a/core/backend/token_metrics.go
+++ b/core/backend/token_metrics.go
@@ -0,0 +1,33 @@
 package backend
 import (
 	"context"
 	"fmt"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/pkg/grpc/proto"
 	model "github.com/mudler/LocalAI/pkg/model"
 )
 // TokenMetrics loads the backend for modelFile through loader.BackendLoader,
 // using options built by ModelOptions from backendConfig and appConfig, and
 // returns the result of the backend's GetTokenMetrics call.
 //
 // It returns an error when the backend cannot be loaded, when the loader
 // yields a nil backend, or when the metrics request itself fails.
 func TokenMetrics(
 	modelFile string,
 	loader *model.ModelLoader,
 	appConfig *config.ApplicationConfig,
 	backendConfig config.BackendConfig) (*proto.MetricsResponse, error) {
 	opts := ModelOptions(backendConfig, appConfig, []model.Option{
 		model.WithModel(modelFile),
 	})
 	// Named metricsBackend (not "model") so the imported model package is
 	// not shadowed for the rest of the function.
 	metricsBackend, err := loader.BackendLoader(opts...)
 	if err != nil {
 		return nil, err
 	}
 	if metricsBackend == nil {
 		return nil, fmt.Errorf("could not load model")
 	}
 	return metricsBackend.GetTokenMetrics(context.Background(), &proto.MetricsRequest{})
 }
--- a/core/backend/tokenize.go
+++ b/core/backend/tokenize.go
@@ -0,0 +1,44 @@
 package backend
 import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/grpc"
 	model "github.com/mudler/LocalAI/pkg/model"
 )
 // ModelTokenize tokenizes s with the model described by backendConfig.
 //
 // The backend is loaded with loader.GreedyLoader when backendConfig.Backend
 // is empty, and with loader.BackendLoader (with the backend name appended to
 // the options) otherwise. The prompt is then sent to the backend's
 // TokenizeString and the returned tokens are wrapped in a
 // schema.TokenizeResponse. Any load or tokenize error is returned together
 // with an empty response.
 func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (schema.TokenizeResponse, error) {
 	var (
 		tokenizer grpc.Backend
 		err       error
 	)
 	loadOpts := ModelOptions(backendConfig, appConfig, []model.Option{
 		model.WithModel(backendConfig.Model),
 	})
 	switch backendConfig.Backend {
 	case "":
 		tokenizer, err = loader.GreedyLoader(loadOpts...)
 	default:
 		loadOpts = append(loadOpts, model.WithBackendString(backendConfig.Backend))
 		tokenizer, err = loader.BackendLoader(loadOpts...)
 	}
 	if err != nil {
 		return schema.TokenizeResponse{}, err
 	}
 	// Build the predict options and attach the text to tokenize.
 	req := gRPCPredictOpts(backendConfig, loader.ModelPath)
 	req.Prompt = s
 	tokenized, err := tokenizer.TokenizeString(appConfig.Context, req)
 	if err != nil {
 		return schema.TokenizeResponse{}, err
 	}
 	return schema.TokenizeResponse{Tokens: tokenized.Tokens}, nil
 }
--- a/core/backend/transcript.go
+++ b/core/backend/transcript.go
@@ -14,13 +14,11 @@ import (
 func ModelTranscription(audio, language string, translate bool, ml *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (*schema.TranscriptionResult, error) {
-	opts := modelOpts(backendConfig, appConfig, []model.Option{
+	if backendConfig.Backend == "" {
-		model.WithBackendString(model.WhisperBackend),
+		backendConfig.Backend = model.WhisperBackend
-		model.WithModel(backendConfig.Model),
+	}
-		model.WithContext(appConfig.Context),
+
-		model.WithThreads(uint32(*backendConfig.Threads)),
+	opts := ModelOptions(backendConfig, appConfig, []model.Option{})
 		model.WithAssetDir(appConfig.AssetsDestination),
 	})
 	transcriptionModel, err := ml.BackendLoader(opts...)
 	if err != nil {
--- a/core/backend/tts.go
+++ b/core/backend/tts.go
@@ -28,14 +28,9 @@ func ModelTTS(
 		bb = model.PiperBackend
 	}
-	grpcOpts := GRPCModelOpts(backendConfig)
+	opts := ModelOptions(config.BackendConfig{}, appConfig, []model.Option{
 	opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
 		model.WithBackendString(bb),
 		model.WithModel(modelFile),
 		model.WithContext(appConfig.Context),
 		model.WithAssetDir(appConfig.AssetsDestination),
 		model.WithLoadGRPCLoadModelOpts(grpcOpts),
 	})
 	ttsModel, err := loader.BackendLoader(opts...)
 	if err != nil {
--- a/core/cli/soundgeneration.go
+++ b/core/cli/soundgeneration.go
@@ -85,13 +85,14 @@ func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {
 	options := config.BackendConfig{}
 	options.SetDefaults()
 	options.Backend = t.Backend
 	var inputFile *string
 	if t.InputFile != "" {
 		inputFile = &t.InputFile
 	}
-	filePath, _, err := backend.SoundGeneration(t.Backend, t.Model, text,
+	filePath, _, err := backend.SoundGeneration(t.Model, text,
 		parseToFloat32Ptr(t.Duration), parseToFloat32Ptr(t.Temperature), &t.DoSample,
 		inputFile, parseToInt32Ptr(t.InputFileSampleDivisor), ml, opts, options)
--- a/core/cli/util.go
+++ b/core/cli/util.go
@@ -17,6 +17,7 @@ import (
 // UtilCMD groups the utility subcommands: gguf-info, hf-scan and
 // usecase-heuristic. Each field is declared as a command through its `cmd`
 // struct tag, with the subcommand name given by `name`.
 type UtilCMD struct {
 	GGUFInfo         GGUFInfoCMD         `cmd:"" name:"gguf-info" help:"Get information about a GGUF file"`
 	HFScan           HFScanCMD           `cmd:"" name:"hf-scan" help:"Checks installed models for known security issues. WARNING: this is a best-effort feature and may not catch everything!"`
 	UsecaseHeuristic UsecaseHeuristicCMD `cmd:"" name:"usecase-heuristic" help:"Checks a specific model config and prints what usecase LocalAI will offer for it."`
 }
 type GGUFInfoCMD struct {
@@ -30,6 +31,11 @@ type HFScanCMD struct {
 	ToScan     []string `arg:""`
 }
 // UsecaseHeuristicCMD holds the arguments of the usecase-heuristic subcommand.
 type UsecaseHeuristicCMD struct {
 	// ConfigName is the model configuration that Run loads and inspects.
 	// NOTE(review): the `name` tag value reads like help text rather than a
 	// flag name; confirm whether a `help:` tag was intended.
 	ConfigName string `name:"The config file to check"`
 	// ModelsPath is the directory handed to the backend config loader in Run.
 	ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
 }
 func (u *GGUFInfoCMD) Run(ctx *cliContext.Context) error {
 	if u.Args == nil || len(u.Args) == 0 {
 		return fmt.Errorf("no GGUF file provided")
@@ -99,3 +105,31 @@ func (hfscmd *HFScanCMD) Run(ctx *cliContext.Context) error {
 		return nil
 	}
 }
 // Run loads the model configuration named ConfigName from ModelsPath and logs
 // the name of every usecase flag for which HasUsecases reports true.
 //
 // It returns an error when ConfigName or ModelsPath is empty, when the
 // configuration cannot be loaded, or when it is not present after loading.
 func (uhcmd *UsecaseHeuristicCMD) Run(ctx *cliContext.Context) error {
 	if len(uhcmd.ConfigName) == 0 {
 		log.Error().Msg("ConfigName is a required parameter")
 		return fmt.Errorf("config name is a required parameter")
 	}
 	if len(uhcmd.ModelsPath) == 0 {
 		log.Error().Msg("ModelsPath is a required parameter")
 		return fmt.Errorf("model path is a required parameter")
 	}
 	bcl := config.NewBackendConfigLoader(uhcmd.ModelsPath)
 	err := bcl.LoadBackendConfig(uhcmd.ConfigName)
 	if err != nil {
 		log.Error().Err(err).Str("ConfigName", uhcmd.ConfigName).Msg("error while loading backend")
 		return err
 	}
 	bc, exists := bcl.GetBackendConfig(uhcmd.ConfigName)
 	if !exists {
 		log.Error().Str("ConfigName", uhcmd.ConfigName).Msg("ConfigName not found")
 		// Stop here: evaluating the zero-value config would report usecases
 		// for a model that does not exist.
 		return fmt.Errorf("config %q not found", uhcmd.ConfigName)
 	}
 	for name, uc := range config.GetAllBackendConfigUsecases() {
 		if bc.HasUsecases(uc) {
 			// A zerolog event is only written once Msg/Send is called.
 			log.Info().Str("Usecase", name).Send()
 		}
 	}
 	log.Info().Msg("---")
 	return nil
 }
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -3,11 +3,13 @@ package config
 import (
 	"os"
 	"regexp"
 	"slices"
 	"strings"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/downloader"
 	"github.com/mudler/LocalAI/pkg/functions"
 	"gopkg.in/yaml.v3"
 )
 const (
@@ -34,6 +36,8 @@ type BackendConfig struct {
 	Embeddings          *bool                  `yaml:"embeddings"`
 	Backend             string                 `yaml:"backend"`
 	TemplateConfig      TemplateConfig         `yaml:"template"`
 	KnownUsecaseStrings []string               `yaml:"known_usecases"`
 	KnownUsecases       *BackendConfigUsecases `yaml:"-"`
 	PromptStrings, InputStrings                []string               `yaml:"-"`
 	InputToken                                 [][]int                `yaml:"-"`
@@ -192,6 +196,21 @@ type TemplateConfig struct {
 	// JoinChatMessagesByCharacter is a string that will be used to join chat messages together.
 	// It defaults to \n
 	JoinChatMessagesByCharacter *string `yaml:"join_chat_messages_by_character"`
 	Video string `yaml:"video"`
 	Image string `yaml:"image"`
 	Audio string `yaml:"audio"`
 }
 // UnmarshalYAML decodes value into c and then fills KnownUsecases from the
 // decoded KnownUsecaseStrings via GetUsecasesFromYAML. On a decode error c is
 // left untouched and the error is returned.
 func (c *BackendConfig) UnmarshalYAML(value *yaml.Node) error {
 	// plain shares BackendConfig's fields but not its method set, so decoding
 	// into it does not re-enter this UnmarshalYAML.
 	type plain BackendConfig
 	decoded := plain{}
 	err := value.Decode(&decoded)
 	if err != nil {
 		return err
 	}
 	decoded.KnownUsecases = GetUsecasesFromYAML(decoded.KnownUsecaseStrings)
 	*c = BackendConfig(decoded)
 	return nil
 }
 func (c *BackendConfig) SetFunctionCallString(s string) {
@@ -410,3 +429,121 @@ func (c *BackendConfig) Validate() bool {
 func (c *BackendConfig) HasTemplate() bool {
 	return c.TemplateConfig.Completion != "" || c.TemplateConfig.Edit != "" || c.TemplateConfig.Chat != "" || c.TemplateConfig.ChatMessage != ""
 }
 // BackendConfigUsecases is a bit set of usecases; every FLAG_* constant below
 // other than FLAG_ANY and FLAG_LLM occupies exactly one bit.
 type BackendConfigUsecases int
 const (
 	// FLAG_ANY has no bits set, so (u & FLAG_ANY) == FLAG_ANY holds for any u.
 	FLAG_ANY              BackendConfigUsecases = 0b000000000
 	FLAG_CHAT             BackendConfigUsecases = 0b000000001
 	FLAG_COMPLETION       BackendConfigUsecases = 0b000000010
 	FLAG_EDIT             BackendConfigUsecases = 0b000000100
 	FLAG_EMBEDDINGS       BackendConfigUsecases = 0b000001000
 	FLAG_RERANK           BackendConfigUsecases = 0b000010000
 	FLAG_IMAGE            BackendConfigUsecases = 0b000100000
 	FLAG_TRANSCRIPT       BackendConfigUsecases = 0b001000000
 	FLAG_TTS              BackendConfigUsecases = 0b010000000
 	FLAG_SOUND_GENERATION BackendConfigUsecases = 0b100000000
 	// Common Subsets
 	// FLAG_LLM is the union of chat, completion and edit. The members live on
 	// distinct bits, so they are combined with |; combining them with & would
 	// always yield 0, which is indistinguishable from FLAG_ANY.
 	FLAG_LLM BackendConfigUsecases = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT
 )
 // GetAllBackendConfigUsecases returns a freshly built map from each flag's
 // identifier (for example "FLAG_CHAT") to its BackendConfigUsecases value.
 func GetAllBackendConfigUsecases() map[string]BackendConfigUsecases {
 	byName := make(map[string]BackendConfigUsecases, 11)
 	byName["FLAG_ANY"] = FLAG_ANY
 	byName["FLAG_CHAT"] = FLAG_CHAT
 	byName["FLAG_COMPLETION"] = FLAG_COMPLETION
 	byName["FLAG_EDIT"] = FLAG_EDIT
 	byName["FLAG_EMBEDDINGS"] = FLAG_EMBEDDINGS
 	byName["FLAG_RERANK"] = FLAG_RERANK
 	byName["FLAG_IMAGE"] = FLAG_IMAGE
 	byName["FLAG_TRANSCRIPT"] = FLAG_TRANSCRIPT
 	byName["FLAG_TTS"] = FLAG_TTS
 	byName["FLAG_SOUND_GENERATION"] = FLAG_SOUND_GENERATION
 	byName["FLAG_LLM"] = FLAG_LLM
 	return byName
 }
 // GetUsecasesFromYAML converts a list of usecase names into a combined flag
 // set. Each name is upper-cased and prefixed with "FLAG_" before being looked
 // up in GetAllBackendConfigUsecases; names that are not found are ignored.
 // It returns nil when input is empty.
 func GetUsecasesFromYAML(input []string) *BackendConfigUsecases {
 	if len(input) == 0 {
 		return nil
 	}
 	known := GetAllBackendConfigUsecases()
 	combined := FLAG_ANY
 	for i := range input {
 		// A missing key yields the zero value, which leaves combined as is.
 		combined |= known["FLAG_"+strings.ToUpper(input[i])]
 	}
 	return &combined
 }
 // HasUsecases reports whether the config can serve every usecase set in u.
 // When KnownUsecases is set and already contains all bits of u the answer is
 // true; in every other case the decision is delegated to GuessUsecases.
 func (c *BackendConfig) HasUsecases(u BackendConfigUsecases) bool {
 	if known := c.KnownUsecases; known != nil && (*known&u) == u {
 		return true
 	}
 	return c.GuessUsecases(u)
 }
 // GuessUsecases is a **heuristic based** check: the backend may not be loaded
 // yet and the config may not record what it is good at, so the answer is
 // derived from config properties alone.
 // Ideally every check would look at properties such as templates; the backend
 // name comparisons in the lower half are a maintenance burden (the lists need
 // updating for each new backend) but are currently the best option for some
 // services.
 //
 // It returns false as soon as one usecase requested in u is not supported by
 // the config, and true otherwise.
 func (c *BackendConfig) GuessUsecases(u BackendConfigUsecases) bool {
 	// requested reports whether every bit of flag is set in u.
 	requested := func(flag BackendConfigUsecases) bool {
 		return u&flag == flag
 	}
 	tc := &c.TemplateConfig
 	// Usecases decided by config properties.
 	if requested(FLAG_CHAT) && tc.Chat == "" && tc.ChatMessage == "" {
 		return false
 	}
 	if requested(FLAG_COMPLETION) && tc.Completion == "" {
 		return false
 	}
 	if requested(FLAG_EDIT) && tc.Edit == "" {
 		return false
 	}
 	if requested(FLAG_EMBEDDINGS) && (c.Embeddings == nil || !*c.Embeddings) {
 		return false
 	}
 	// Usecases decided by backend name.
 	if requested(FLAG_IMAGE) {
 		knownImageBackend := slices.Contains([]string{"diffusers", "tinydream", "stablediffusion"}, c.Backend)
 		if !knownImageBackend {
 			return false
 		}
 		// diffusers additionally needs a pipeline type to be configured.
 		if c.Backend == "diffusers" && c.Diffusers.PipelineType == "" {
 			return false
 		}
 	}
 	if requested(FLAG_RERANK) && c.Backend != "rerankers" {
 		return false
 	}
 	if requested(FLAG_TRANSCRIPT) && c.Backend != "whisper" {
 		return false
 	}
 	if requested(FLAG_TTS) && !slices.Contains([]string{"piper", "transformers-musicgen", "parler-tts"}, c.Backend) {
 		return false
 	}
 	if requested(FLAG_SOUND_GENERATION) && c.Backend != "transformers-musicgen" {
 		return false
 	}
 	return true
 }
--- a/core/config/backend_config_filter.go
+++ b/core/config/backend_config_filter.go
@@ -0,0 +1,35 @@
 package config
 import "regexp"
 // BackendConfigFilterFn decides whether an entry is kept: it receives a name
 // and a *BackendConfig and returns true to keep the entry. The builders in
 // this file treat the config pointer as possibly nil.
 type BackendConfigFilterFn func(string, *BackendConfig) bool
 // NoFilterFn is a BackendConfigFilterFn that keeps every entry.
 func NoFilterFn(_ string, _ *BackendConfig) bool {
 	return true
 }
 // BuildNameFilterFn returns a filter that matches entries against the regular
 // expression filter. An empty filter yields NoFilterFn; an invalid expression
 // yields a nil filter and the compile error.
 //
 // The returned filter matches against config.Name when a config is supplied
 // and against the name argument when config is nil.
 func BuildNameFilterFn(filter string) (BackendConfigFilterFn, error) {
 	if filter == "" {
 		return NoFilterFn, nil
 	}
 	pattern, compileErr := regexp.Compile(filter)
 	if compileErr != nil {
 		return nil, compileErr
 	}
 	matchName := func(name string, config *BackendConfig) bool {
 		candidate := name
 		if config != nil {
 			candidate = config.Name
 		}
 		return pattern.MatchString(candidate)
 	}
 	return matchName, nil
 }
 // BuildUsecaseFilterFn returns a filter that keeps configs for which
 // HasUsecases(usecases) is true. FLAG_ANY yields NoFilterFn.
 func BuildUsecaseFilterFn(usecases BackendConfigUsecases) BackendConfigFilterFn {
 	if usecases == FLAG_ANY {
 		return NoFilterFn
 	}
 	return func(name string, config *BackendConfig) bool {
 		// TODO: Potentially make this a param, for now, no known usecase to include
 		// entries that have no config.
 		return config != nil && config.HasUsecases(usecases)
 	}
 }
--- a/core/config/backend_config_loader.go
+++ b/core/config/backend_config_loader.go
@@ -201,6 +201,26 @@ func (bcl *BackendConfigLoader) GetAllBackendConfigs() []BackendConfig {
 	return res
 }
 // GetBackendConfigsByFilter returns copies of the loaded configs accepted by
 // filter; a nil filter accepts everything. The loader is locked while the
 // configs are read. The result follows map iteration order and is therefore
 // not sorted.
 func (bcl *BackendConfigLoader) GetBackendConfigsByFilter(filter BackendConfigFilterFn) []BackendConfig {
 	if filter == nil {
 		filter = NoFilterFn
 	}
 	bcl.Lock()
 	defer bcl.Unlock()
 	var matched []BackendConfig
 	for name, cfg := range bcl.configs {
 		if !filter(name, &cfg) {
 			continue
 		}
 		matched = append(matched, cfg)
 	}
 	// TODO: I don't think this one needs to Sort on name... but we'll see what breaks.
 	return matched
 }
 func (bcl *BackendConfigLoader) RemoveBackendConfig(m string) {
 	bcl.Lock()
 	defer bcl.Unlock()
--- a/core/config/backend_config_test.go
+++ b/core/config/backend_config_test.go
@@ -19,12 +19,17 @@ var _ = Describe("Test cases for config related functions", func() {
 				`backend: "../foo-bar"
 name: "foo"
 parameters:
-  model: "foo-bar"`)
+  model: "foo-bar"
 known_usecases:
 - chat
 - COMPLETION
 `)
 			Expect(err).ToNot(HaveOccurred())
 			config, err := readBackendConfigFromFile(tmp.Name())
 			Expect(err).To(BeNil())
 			Expect(config).ToNot(BeNil())
 			Expect(config.Validate()).To(BeFalse())
 			Expect(config.KnownUsecases).ToNot(BeNil())
 		})
 		It("Test Validate", func() {
 			tmp, err := os.CreateTemp("", "config.yaml")
@@ -61,4 +66,99 @@ parameters:
 			Expect(config.Validate()).To(BeTrue())
 		})
 	})
 	It("Properly handles backend usecase matching", func() {
 		a := BackendConfig{
 			Name: "a",
 		}
 		Expect(a.HasUsecases(FLAG_ANY)).To(BeTrue()) // FLAG_ANY just means the config _exists_ essentially.
 		b := BackendConfig{
 			Name:    "b",
 			Backend: "stablediffusion",
 		}
 		Expect(b.HasUsecases(FLAG_ANY)).To(BeTrue())
 		Expect(b.HasUsecases(FLAG_IMAGE)).To(BeTrue())
 		Expect(b.HasUsecases(FLAG_CHAT)).To(BeFalse())
 		c := BackendConfig{
 			Name:    "c",
 			Backend: "llama-cpp",
 			TemplateConfig: TemplateConfig{
 				Chat: "chat",
 			},
 		}
 		Expect(c.HasUsecases(FLAG_ANY)).To(BeTrue())
 		Expect(c.HasUsecases(FLAG_IMAGE)).To(BeFalse())
 		Expect(c.HasUsecases(FLAG_COMPLETION)).To(BeFalse())
 		Expect(c.HasUsecases(FLAG_CHAT)).To(BeTrue())
 		d := BackendConfig{
 			Name:    "d",
 			Backend: "llama-cpp",
 			TemplateConfig: TemplateConfig{
 				Chat:       "chat",
 				Completion: "completion",
 			},
 		}
 		Expect(d.HasUsecases(FLAG_ANY)).To(BeTrue())
 		Expect(d.HasUsecases(FLAG_IMAGE)).To(BeFalse())
 		Expect(d.HasUsecases(FLAG_COMPLETION)).To(BeTrue())
 		Expect(d.HasUsecases(FLAG_CHAT)).To(BeTrue())
 		trueValue := true
 		e := BackendConfig{
 			Name:    "e",
 			Backend: "llama-cpp",
 			TemplateConfig: TemplateConfig{
 				Completion: "completion",
 			},
 			Embeddings: &trueValue,
 		}
 		Expect(e.HasUsecases(FLAG_ANY)).To(BeTrue())
 		Expect(e.HasUsecases(FLAG_IMAGE)).To(BeFalse())
 		Expect(e.HasUsecases(FLAG_COMPLETION)).To(BeTrue())
 		Expect(e.HasUsecases(FLAG_CHAT)).To(BeFalse())
 		Expect(e.HasUsecases(FLAG_EMBEDDINGS)).To(BeTrue())
 		f := BackendConfig{
 			Name:    "f",
 			Backend: "piper",
 		}
 		Expect(f.HasUsecases(FLAG_ANY)).To(BeTrue())
 		Expect(f.HasUsecases(FLAG_TTS)).To(BeTrue())
 		Expect(f.HasUsecases(FLAG_CHAT)).To(BeFalse())
 		g := BackendConfig{
 			Name:    "g",
 			Backend: "whisper",
 		}
 		Expect(g.HasUsecases(FLAG_ANY)).To(BeTrue())
 		Expect(g.HasUsecases(FLAG_TRANSCRIPT)).To(BeTrue())
 		Expect(g.HasUsecases(FLAG_TTS)).To(BeFalse())
 		h := BackendConfig{
 			Name:    "h",
 			Backend: "transformers-musicgen",
 		}
 		Expect(h.HasUsecases(FLAG_ANY)).To(BeTrue())
 		Expect(h.HasUsecases(FLAG_TRANSCRIPT)).To(BeFalse())
 		Expect(h.HasUsecases(FLAG_TTS)).To(BeTrue())
 		Expect(h.HasUsecases(FLAG_SOUND_GENERATION)).To(BeTrue())
 		knownUsecases := FLAG_CHAT | FLAG_COMPLETION
 		i := BackendConfig{
 			Name:    "i",
 			Backend: "whisper",
 			// Earlier test checks parsing, this just needs to set final values
 			KnownUsecases: &knownUsecases,
 		}
 		Expect(i.HasUsecases(FLAG_ANY)).To(BeTrue())
 		Expect(i.HasUsecases(FLAG_TRANSCRIPT)).To(BeTrue())
 		Expect(i.HasUsecases(FLAG_TTS)).To(BeFalse())
 		Expect(i.HasUsecases(FLAG_COMPLETION)).To(BeTrue())
 		Expect(i.HasUsecases(FLAG_CHAT)).To(BeTrue())
 	})
 })
--- a/core/http/ctx/fiber.go
+++ b/core/http/ctx/fiber.go
@@ -19,14 +19,16 @@ func ModelFromContext(ctx *fiber.Ctx, cl *config.BackendConfigLoader, loader *mo
 	if ctx.Params("model") != "" {
 		modelInput = ctx.Params("model")
 	}
-
+	if ctx.Query("model") != "" {
 		modelInput = ctx.Query("model")
 	}
 	// Set model from bearer token, if available
-	bearer := strings.TrimLeft(ctx.Get("authorization"), "Bearer ")
+	bearer := strings.TrimLeft(ctx.Get("authorization"), "Bear ") // Reduced duplicate characters of Bearer
 	bearerExists := bearer != "" && loader.ExistsInModelPath(bearer)
 	// If no model was specified, take the first available
 	if modelInput == "" && !bearerExists && firstModel {
-		models, _ := services.ListModels(cl, loader, "", true)
+		models, _ := services.ListModels(cl, loader, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
 		if len(models) > 0 {
 			modelInput = models[0]
 			log.Debug().Msgf("No model specified, using: %s", modelInput)
--- a/core/http/endpoints/elevenlabs/soundgeneration.go
+++ b/core/http/endpoints/elevenlabs/soundgeneration.go
@@ -55,7 +55,7 @@ func SoundGenerationEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoad
 		}
 		// TODO: Support uploading files?
-		filePath, _, err := backend.SoundGeneration(cfg.Backend, modelFile, input.Text, input.Duration, input.Temperature, input.DoSample, nil, nil, ml, appConfig, *cfg)
+		filePath, _, err := backend.SoundGeneration(modelFile, input.Text, input.Duration, input.Temperature, input.DoSample, nil, nil, ml, appConfig, *cfg)
 		if err != nil {
 			return err
 		}
--- a/core/http/endpoints/jina/rerank.go
+++ b/core/http/endpoints/jina/rerank.go
@@ -45,13 +45,13 @@ func JINARerankEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a
 			config.LoadOptionContextSize(appConfig.ContextSize),
 			config.LoadOptionF16(appConfig.F16),
 		)
 		if err != nil {
 			modelFile = input.Model
 			log.Warn().Msgf("Model not found in context: %s", input.Model)
 		} else {
 			modelFile = cfg.Model
 		}
 		log.Debug().Msgf("Request for model: %s", modelFile)
 		if input.Backend != "" {
@@ -64,7 +64,7 @@ func JINARerankEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a
 			Documents: req.Documents,
 		}
-		results, err := backend.Rerank(cfg.Backend, modelFile, request, ml, appConfig, *cfg)
+		results, err := backend.Rerank(modelFile, request, ml, appConfig, *cfg)
 		if err != nil {
 			return err
 		}
--- a/core/http/endpoints/localai/get_token_metrics.go
+++ b/core/http/endpoints/localai/get_token_metrics.go
@@ -0,0 +1,60 @@
 package localai
 import (
 	"github.com/gofiber/fiber/v2"
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
 	fiberContext "github.com/mudler/LocalAI/core/http/ctx"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/rs/zerolog/log"
 	"github.com/mudler/LocalAI/pkg/model"
 )
 // TokenMetricsEndpoint is an endpoint to get TokensProcessed Per Second for Active SlotID
 //
 //	@Summary	Get TokenMetrics for Active Slot.
 //	@Accept json
 //	@Produce audio/x-wav
 //	@Success	200		{string}	binary				"generated audio/wav file"
 //	@Router		/v1/tokenMetrics [get]
 //	@Router		/tokenMetrics [get]
 func TokenMetricsEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 		input := new(schema.TokenMetricsRequest)
 		// Get input data from the request body
 		if err := c.BodyParser(input); err != nil {
 			return err
 		}
 		// Resolve the model name; fall back to the name given in the request.
 		modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.Model, false)
 		if err != nil {
 			modelFile = input.Model
 			log.Warn().Msgf("Model not found in context: %s", input.Model)
 		}
 		cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
 			config.LoadOptionDebug(appConfig.Debug),
 			config.LoadOptionThreads(appConfig.Threads),
 			config.LoadOptionContextSize(appConfig.ContextSize),
 			config.LoadOptionF16(appConfig.F16),
 		)
 		if err != nil {
 			// The load error is attached to the warning itself: a bare
 			// log.Err(err) builds an event that is never written.
 			log.Warn().Err(err).Msgf("Model not found in context: %s", input.Model)
 			if cfg == nil {
 				// cfg is dereferenced below; fail the request instead of
 				// panicking on a nil pointer.
 				return err
 			}
 			modelFile = input.Model
 		} else {
 			modelFile = cfg.Model
 		}
 		log.Debug().Msgf("Token Metrics for model: %s", modelFile)
 		response, err := backend.TokenMetrics(modelFile, ml, appConfig, *cfg)
 		if err != nil {
 			return err
 		}
 		return c.JSON(response)
 	}
 }
--- a/core/http/endpoints/localai/system.go
+++ b/core/http/endpoints/localai/system.go
@@ -17,12 +17,14 @@ func SystemInformations(ml *model.ModelLoader, appConfig *config.ApplicationConf
 		if err != nil {
 			return err
 		}
 		loadedModels := ml.ListModels()
 		for b := range appConfig.ExternalGRPCBackends {
 			availableBackends = append(availableBackends, b)
 		}
 		return c.JSON(
 			schema.SystemInformationResponse{
 				Backends: availableBackends,
 				Models:   loadedModels,
 			},
 		)
 	}
--- a/core/http/endpoints/localai/tokenize.go
+++ b/core/http/endpoints/localai/tokenize.go
@@ -0,0 +1,58 @@
 package localai
 import (
 	"github.com/gofiber/fiber/v2"
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
 	fiberContext "github.com/mudler/LocalAI/core/http/ctx"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/rs/zerolog/log"
 )
 // TokenizeEndpoint exposes a REST API to tokenize the content
 // @Summary Tokenize the input.
 // @Success 200 {object} schema.TokenizeResponse "Response"
 // @Router /v1/tokenize [post]
 func TokenizeEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 		input := new(schema.TokenizeRequest)
 		// Get input data from the request body
 		if err := c.BodyParser(input); err != nil {
 			return err
 		}
 		// Resolve the model name; fall back to the name given in the request.
 		modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.Model, false)
 		if err != nil {
 			modelFile = input.Model
 			log.Warn().Msgf("Model not found in context: %s", input.Model)
 		}
 		cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
 			config.LoadOptionDebug(appConfig.Debug),
 			config.LoadOptionThreads(appConfig.Threads),
 			config.LoadOptionContextSize(appConfig.ContextSize),
 			config.LoadOptionF16(appConfig.F16),
 		)
 		if err != nil {
 			// The load error is attached to the warning itself: a bare
 			// log.Err(err) builds an event that is never written.
 			log.Warn().Err(err).Msgf("Model not found in context: %s", input.Model)
 			if cfg == nil {
 				// cfg is dereferenced below; fail the request instead of
 				// panicking on a nil pointer.
 				return err
 			}
 			modelFile = input.Model
 		} else {
 			modelFile = cfg.Model
 		}
 		log.Debug().Msgf("Request for model: %s", modelFile)
 		tokenResponse, err := backend.ModelTokenize(input.Content, ml, *cfg, appConfig)
 		if err != nil {
 			return err
 		}
 		// Propagate serialization/write failures instead of discarding them.
 		return c.JSON(tokenResponse)
 	}
 }
--- a/core/http/endpoints/localai/welcome.go
+++ b/core/http/endpoints/localai/welcome.go
@@ -13,7 +13,7 @@ import (
 func WelcomeEndpoint(appConfig *config.ApplicationConfig,
 	cl *config.BackendConfigLoader, ml *model.ModelLoader, modelStatus func() (map[string]string, map[string]string)) func(*fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
-		models, _ := services.ListModels(cl, ml, "", true)
+		models, _ := services.ListModels(cl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
 		backendConfigs := cl.GetAllBackendConfigs()
 		galleryConfigs := map[string]*gallery.Config{}
@@ -32,18 +32,10 @@ func WelcomeEndpoint(appConfig *config.ApplicationConfig,
 		// Get model statuses to display in the UI the operation in progress
 		processingModels, taskTypes := modelStatus()
 		modelsWithoutConfig := []string{}
 		for _, m := range models {
 			if _, ok := modelsWithBackendConfig[m]; !ok {
 				modelsWithoutConfig = append(modelsWithoutConfig, m)
 			}
 		}
 		summary := fiber.Map{
 			"Title":             "LocalAI API - " + internal.PrintableVersion(),
 			"Version":           internal.PrintableVersion(),
-			"Models":            modelsWithoutConfig,
+			"Models":            models,
 			"ModelsConfig":      backendConfigs,
 			"GalleryConfig":     galleryConfigs,
 			"IsP2PEnabled":      p2p.IsP2PEnabled(),
--- a/core/http/endpoints/openai/assistant.go
+++ b/core/http/endpoints/openai/assistant.go
@@ -225,7 +225,7 @@ func filterAssistantsAfterID(assistants []Assistant, id string) []Assistant {
 func modelExists(cl *config.BackendConfigLoader, ml *model.ModelLoader, modelName string) (found bool) {
 	found = false
-	models, err := services.ListModels(cl, ml, "", true)
+	models, err := services.ListModels(cl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
 	if err != nil {
 		return
 	}
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -161,6 +161,12 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 		textContentToReturn = ""
 		id = uuid.New().String()
 		created = int(time.Now().Unix())
 		// Set CorrelationID
 		correlationID := c.Get("X-Correlation-ID")
 		if len(strings.TrimSpace(correlationID)) == 0 {
 			correlationID = id
 		}
 		c.Set("X-Correlation-ID", correlationID)
 		modelFile, input, err := readRequest(c, cl, ml, startupOptions, true)
 		if err != nil {
@@ -444,6 +450,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 			c.Set("Cache-Control", "no-cache")
 			c.Set("Connection", "keep-alive")
 			c.Set("Transfer-Encoding", "chunked")
 			c.Set("X-Correlation-ID", id)
 			responses := make(chan schema.OpenAIResponse)
--- a/core/http/endpoints/openai/completion.go
+++ b/core/http/endpoints/openai/completion.go
@@ -57,6 +57,8 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a
 	}
 	return func(c *fiber.Ctx) error {
 		// Add Correlation
 		c.Set("X-Correlation-ID", id)
 		modelFile, input, err := readRequest(c, cl, ml, appConfig, true)
 		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
--- a/core/http/endpoints/openai/list.go
+++ b/core/http/endpoints/openai/list.go
@@ -18,32 +18,32 @@ func ListModelsEndpoint(bcl *config.BackendConfigLoader, ml *model.ModelLoader)
 		filter := c.Query("filter")
 		// By default, exclude any loose files that are already referenced by a configuration file.
-		excludeConfigured := c.QueryBool("excludeConfigured", true)
+		var policy services.LooseFilePolicy
 		if c.QueryBool("excludeConfigured", true) {
 			policy = services.SKIP_IF_CONFIGURED
 		} else {
 			policy = services.ALWAYS_INCLUDE // This replicates current behavior. TODO: give more options to the user?
 		}
-		dataModels, err := modelList(bcl, ml, filter, excludeConfigured)
+		filterFn, err := config.BuildNameFilterFn(filter)
 		if err != nil {
 			return err
 		}
 		modelNames, err := services.ListModels(bcl, ml, filterFn, policy)
 		if err != nil {
 			return err
 		}
 		// Map from a slice of names to a slice of OpenAIModel response objects
 		dataModels := []schema.OpenAIModel{}
 		for _, m := range modelNames {
 			dataModels = append(dataModels, schema.OpenAIModel{ID: m, Object: "model"})
 		}
 		return c.JSON(schema.ModelsDataResponse{
 			Object: "list",
 			Data:   dataModels,
 		})
 	}
 }
 // modelList asks services.ListModels for the model names selected by filter
 // and excludeConfigured and wraps each name in a schema.OpenAIModel whose
 // Object is "model". The returned slice is non-nil even when no model matches.
 func modelList(bcl *config.BackendConfigLoader, ml *model.ModelLoader, filter string, excludeConfigured bool) ([]schema.OpenAIModel, error) {
 	names, err := services.ListModels(bcl, ml, filter, excludeConfigured)
 	if err != nil {
 		return nil, err
 	}
 	entries := make([]schema.OpenAIModel, 0, len(names))
 	for i := range names {
 		entries = append(entries, schema.OpenAIModel{ID: names[i], Object: "model"})
 	}
 	return entries, nil
 }
--- a/core/http/endpoints/openai/request.go
+++ b/core/http/endpoints/openai/request.go
@@ -6,15 +6,22 @@ import (
 	"fmt"
 	"github.com/gofiber/fiber/v2"
 	"github.com/google/uuid"
 	"github.com/mudler/LocalAI/core/config"
 	fiberContext "github.com/mudler/LocalAI/core/http/ctx"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/functions"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/templates"
 	"github.com/mudler/LocalAI/pkg/utils"
 	"github.com/rs/zerolog/log"
 )
 type correlationIDKeyType string
 // CorrelationIDKey to track request across process boundary
 const CorrelationIDKey correlationIDKeyType = "correlationID"
 func readRequest(c *fiber.Ctx, cl *config.BackendConfigLoader, ml *model.ModelLoader, o *config.ApplicationConfig, firstModel bool) (string, *schema.OpenAIRequest, error) {
 	input := new(schema.OpenAIRequest)
@@ -24,9 +31,14 @@ func readRequest(c *fiber.Ctx, cl *config.BackendConfigLoader, ml *model.ModelLo
 	}
 	received, _ := json.Marshal(input)
 	// Extract or generate the correlation ID
 	correlationID := c.Get("X-Correlation-ID", uuid.New().String())
 	ctx, cancel := context.WithCancel(o.Context)
-	input.Context = ctx
+	// Add the correlation ID to the new context
 	ctxWithCorrelationID := context.WithValue(ctx, CorrelationIDKey, correlationID)
 	input.Context = ctxWithCorrelationID
 	input.Cancel = cancel
 	log.Debug().Msgf("Request received: %s", string(received))
@@ -157,8 +169,13 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
 						continue CONTENT
 					}
 					input.Messages[i].StringVideos = append(input.Messages[i].StringVideos, base64) // TODO: make sure that we only return base64 stuff
 					t := "[vid-{{.ID}}]{{.Text}}"
 					if config.TemplateConfig.Video != "" {
 						t = config.TemplateConfig.Video
 					}
 					// set a placeholder for each image
-					input.Messages[i].StringContent = fmt.Sprintf("[vid-%d]", vidIndex) + input.Messages[i].StringContent
+					input.Messages[i].StringContent, _ = templates.TemplateMultiModal(t, vidIndex, input.Messages[i].StringContent)
 					vidIndex++
 				case "audio_url", "audio":
 					// Decode content as base64 either if it's an URL or base64 text
@@ -169,7 +186,11 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
 					}
 					input.Messages[i].StringAudios = append(input.Messages[i].StringAudios, base64) // TODO: make sure that we only return base64 stuff
 					// set a placeholder for each image
-					input.Messages[i].StringContent = fmt.Sprintf("[audio-%d]", audioIndex) + input.Messages[i].StringContent
+					t := "[audio-{{.ID}}]{{.Text}}"
 					if config.TemplateConfig.Audio != "" {
 						t = config.TemplateConfig.Audio
 					}
 					input.Messages[i].StringContent, _ = templates.TemplateMultiModal(t, audioIndex, input.Messages[i].StringContent)
 					audioIndex++
 				case "image_url", "image":
 					// Decode content as base64 either if it's an URL or base64 text
@@ -178,9 +199,14 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
 						log.Error().Msgf("Failed encoding image: %s", err)
 						continue CONTENT
 					}
 					t := "[img-{{.ID}}]{{.Text}}"
 					if config.TemplateConfig.Image != "" {
 						t = config.TemplateConfig.Image
 					}
 					input.Messages[i].StringImages = append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff
 					// set a placeholder for each image
-					input.Messages[i].StringContent = fmt.Sprintf("[img-%d]", imgIndex) + input.Messages[i].StringContent
+					input.Messages[i].StringContent, _ = templates.TemplateMultiModal(t, imgIndex, input.Messages[i].StringContent)
 					imgIndex++
 				}
 			}
--- a/core/http/routes/localai.go
+++ b/core/http/routes/localai.go
@@ -63,4 +63,7 @@ func RegisterLocalAIRoutes(app *fiber.App,
 	app.Get("/system", localai.SystemInformations(ml, appConfig))
 	// misc
 	app.Post("/v1/tokenize", localai.TokenizeEndpoint(cl, ml, appConfig))
 }
--- a/core/http/routes/ui.go
+++ b/core/http/routes/ui.go
@@ -303,7 +303,7 @@ func RegisterUIRoutes(app *fiber.App,
 	// Show the Chat page
 	app.Get("/chat/:model", func(c *fiber.Ctx) error {
-		backendConfigs, _ := services.ListModels(cl, ml, "", true)
+		backendConfigs, _ := services.ListModels(cl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
 		summary := fiber.Map{
 			"Title":        "LocalAI - Chat with " + c.Params("model"),
@@ -318,7 +318,7 @@ func RegisterUIRoutes(app *fiber.App,
 	})
 	app.Get("/talk/", func(c *fiber.Ctx) error {
-		backendConfigs, _ := services.ListModels(cl, ml, "", true)
+		backendConfigs, _ := services.ListModels(cl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
 		if len(backendConfigs) == 0 {
 			// If no model is available redirect to the index which suggests how to install models
@@ -339,7 +339,7 @@ func RegisterUIRoutes(app *fiber.App,
 	app.Get("/chat/", func(c *fiber.Ctx) error {
-		backendConfigs, _ := services.ListModels(cl, ml, "", true)
+		backendConfigs, _ := services.ListModels(cl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
 		if len(backendConfigs) == 0 {
 			// If no model is available redirect to the index which suggests how to install models
--- a/core/p2p/federated_server.go
+++ b/core/p2p/federated_server.go
@@ -7,6 +7,7 @@ import (
 	"context"
 	"errors"
 	"fmt"
 	"io"
 	"net"
 	"github.com/mudler/edgevpn/pkg/node"
@@ -41,7 +42,7 @@ func (fs *FederatedServer) proxy(ctx context.Context, node *node.Node) error {
 		log.Error().Err(err).Msg("Error listening")
 		return err
 	}
-	//	ll.Info("Binding local port on", srcaddr)
+
 	go func() {
 		<-ctx.Done()
 		l.Close()
@@ -82,6 +83,7 @@ func (fs *FederatedServer) proxy(ctx context.Context, node *node.Node) error {
 				if workerID == "" {
 					log.Error().Msg("No available nodes yet")
 					fs.sendHTMLResponse(conn, 503, "Sorry, waiting for nodes to connect")
 					return
 				}
@@ -89,6 +91,7 @@ func (fs *FederatedServer) proxy(ctx context.Context, node *node.Node) error {
 				nodeData, exists := GetNode(fs.service, workerID)
 				if !exists {
 					log.Error().Msgf("Node %s not found", workerID)
 					fs.sendHTMLResponse(conn, 404, "Node not found")
 					return
 				}
@@ -100,3 +103,42 @@ func (fs *FederatedServer) proxy(ctx context.Context, node *node.Node) error {
 		}
 	}
 }
 // sendHTMLResponse writes a minimal HTTP/1.1 response carrying a small HTML
 // page to conn and then closes the connection.
 //
 // statusCode is placed on the status line together with the reason phrase
 // returned by getHTTPStatusText; message is embedded verbatim inside an <h1>
 // element. A failed write is logged and not returned to the caller.
 func (fs *FederatedServer) sendHTMLResponse(conn net.Conn, statusCode int, message string) {
 	// The connection is always released, whether or not the write succeeds.
 	defer conn.Close()
 	// Fixed header section, terminated by the blank line that precedes the body.
 	const headers = "Content-Type: text/html\r\n" +
 		"Connection: close\r\n" +
 		"\r\n"
 	statusLine := fmt.Sprintf("HTTP/1.1 %d %s\r\n", statusCode, getHTTPStatusText(statusCode))
 	body := fmt.Sprintf("<html><body><h1>%s</h1></body></html>\r\n", message)
 	// Emit the whole response with a single write.
 	if _, err := io.WriteString(conn, statusLine+headers+body); err != nil {
 		log.Error().Err(err).Msg("Error writing response to client")
 	}
 }
 // getHTTPStatusText maps the HTTP status codes used by this file (200, 404
 // and 503) to their reason phrase. Any other code yields "Unknown Status".
 func getHTTPStatusText(statusCode int) string {
 	// Start from the fallback and override it for the codes we know about.
 	text := "Unknown Status"
 	switch statusCode {
 	case 200:
 		text = "OK"
 	case 404:
 		text = "Not Found"
 	case 503:
 		text = "Service Unavailable"
 	}
 	return text
 }
--- a/core/schema/localai.go
+++ b/core/schema/localai.go
@@ -2,6 +2,7 @@ package schema
 import (
 	"github.com/mudler/LocalAI/core/p2p"
 	"github.com/mudler/LocalAI/pkg/model"
 	gopsutil "github.com/shirou/gopsutil/v3/process"
 )
@@ -9,6 +10,10 @@ type BackendMonitorRequest struct {
 	Model string `json:"model" yaml:"model"`
 }
 // TokenMetricsRequest is a request payload with a single field, the model
 // name, encoded under the "model" key in both JSON and YAML.
 // NOTE(review): presumably names the model whose token metrics are wanted —
 // confirm against the endpoint that consumes it.
 type TokenMetricsRequest struct {
 	Model string `json:"model" yaml:"model"`
 }
 type BackendMonitorResponse struct {
 	MemoryInfo    *gopsutil.MemoryInfoStat
 	MemoryPercent float32
@@ -73,4 +78,5 @@ type P2PNodesResponse struct {
 // SystemInformationResponse is a response payload made of a list of backend
 // names and a list of models.
 type SystemInformationResponse struct {
 	// Backends is serialized under the "backends" JSON key.
 	Backends []string      `json:"backends"`
 	// Models is serialized under the "loaded_models" JSON key.
 	Models   []model.Model `json:"loaded_models"`
 }
--- a/core/schema/tokenize.go
+++ b/core/schema/tokenize.go
@@ -0,0 +1,10 @@
 package schema
 // TokenizeRequest is the JSON request body for tokenization: a content
 // string and a model name.
 type TokenizeRequest struct {
 	// Content is serialized under the "content" JSON key.
 	Content string `json:"content"`
 	// Model is serialized under the "model" JSON key.
 	Model   string `json:"model"`
 }
 // TokenizeResponse is the JSON response body for tokenization; it carries a
 // list of int32 token values under the "tokens" key.
 type TokenizeResponse struct {
 	Tokens []int32 `json:"tokens"`
 }
--- a/core/services/list_models.go
+++ b/core/services/list_models.go
@@ -1,57 +1,49 @@
 package services
 import (
 	"regexp"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/pkg/model"
 )
-func ListModels(bcl *config.BackendConfigLoader, ml *model.ModelLoader, filter string, excludeConfigured bool) ([]string, error) {
+type LooseFilePolicy int
 // Values of LooseFilePolicy, which tells ListModels how to treat loose files
 // found in the model path relative to the configured models.
 const (
 	// SKIP_IF_CONFIGURED lists configured models and omits any loose file
 	// whose name equals the Model field of a listed configuration.
 	SKIP_IF_CONFIGURED LooseFilePolicy = iota
 	// SKIP_ALWAYS lists configured models only; loose files are not scanned.
 	SKIP_ALWAYS
 	// ALWAYS_INCLUDE lists configured models and every loose file that passes
 	// the filter, without skipping files that back a configuration.
 	ALWAYS_INCLUDE
 	// LOOSE_ONLY omits configured models and lists only loose files.
 	LOOSE_ONLY
 )
 func ListModels(bcl *config.BackendConfigLoader, ml *model.ModelLoader, filter config.BackendConfigFilterFn, looseFilePolicy LooseFilePolicy) ([]string, error) {
 	var skipMap map[string]interface{} = map[string]interface{}{}
 	dataModels := []string{}
 	// Start with known configurations
 	if looseFilePolicy != LOOSE_ONLY {
 		for _, c := range bcl.GetBackendConfigsByFilter(filter) {
 			if looseFilePolicy == SKIP_IF_CONFIGURED {
 				skipMap[c.Model] = nil
 			}
 			dataModels = append(dataModels, c.Name)
 		}
 	}
 	// Then iterate through the loose files if requested.
 	if looseFilePolicy != SKIP_ALWAYS {
 		models, err := ml.ListFilesInModelPath()
 		if err != nil {
 			return nil, err
 		}
 	var mm map[string]interface{} = map[string]interface{}{}
 	dataModels := []string{}
 	var filterFn func(name string) bool
 	// If filter is not specified, do not filter the list by model name
 	if filter == "" {
 		filterFn = func(_ string) bool { return true }
 	} else {
 		// If filter _IS_ specified, we compile it to a regex which is used to create the filterFn
 		rxp, err := regexp.Compile(filter)
 		if err != nil {
 			return nil, err
 		}
 		filterFn = func(name string) bool {
 			return rxp.MatchString(name)
 		}
 	}
 	// Start with the known configurations
 	for _, c := range bcl.GetAllBackendConfigs() {
 		if excludeConfigured {
 			mm[c.Model] = nil
 		}
 		if filterFn(c.Name) {
 			dataModels = append(dataModels, c.Name)
 		}
 	}
 	// Then iterate through the loose files:
 		for _, m := range models {
 			// And only adds them if they shouldn't be skipped.
-		if _, exists := mm[m]; !exists && filterFn(m) {
+			if _, exists := skipMap[m]; !exists && filter(m, nil) {
 				dataModels = append(dataModels, m)
 			}
 		}
 	}
 	return dataModels, nil
 }
--- a/core/startup/startup.go
+++ b/core/startup/startup.go
@@ -160,13 +160,7 @@ func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.Mode
 			log.Debug().Msgf("Auto loading model %s into memory from file: %s", m, cfg.Model)
-			grpcOpts := backend.GRPCModelOpts(*cfg)
+			o := backend.ModelOptions(*cfg, options, []model.Option{})
 			o := []model.Option{
 				model.WithModel(cfg.Model),
 				model.WithAssetDir(options.AssetsDestination),
 				model.WithThreads(uint32(options.Threads)),
 				model.WithLoadGRPCLoadModelOpts(grpcOpts),
 			}
 			var backendErr error
 			if cfg.Backend != "" {
--- a/docs/data/version.json
+++ b/docs/data/version.json
@@ -1,3 +1,3 @@
 {
-  "version": "v2.21.0"
+  "version": "v2.21.1"
 }
--- a/examples/chainlit/requirements.txt
+++ b/examples/chainlit/requirements.txt
@@ -1,4 +1,4 @@
-llama_index==0.11.12
+llama_index==0.11.14
 requests==2.32.3
 weaviate_client==4.8.1
 transformers
--- a/examples/functions/requirements.txt
+++ b/examples/functions/requirements.txt
@@ -1,2 +1,2 @@
-langchain==0.3.0
+langchain==0.3.1
-openai==1.47.1
+openai==1.50.2
--- a/examples/langchain-chroma/requirements.txt
+++ b/examples/langchain-chroma/requirements.txt
@@ -1,4 +1,4 @@
-langchain==0.3.0
+langchain==0.3.1
-openai==1.47.1
+openai==1.50.2
-chromadb==0.5.7
+chromadb==0.5.11
-llama-index==0.11.12
+llama-index==0.11.14
--- a/examples/langchain/langchainpy-localai-example/requirements.txt
+++ b/examples/langchain/langchainpy-localai-example/requirements.txt
@@ -1,4 +1,4 @@
-aiohttp==3.10.3
+aiohttp==3.10.8
 aiosignal==1.3.1
 async-timeout==4.0.3
 attrs==24.2.0
@@ -8,10 +8,10 @@ colorama==0.4.6
 dataclasses-json==0.6.7
 debugpy==1.8.2
 frozenlist==1.4.1
-greenlet==3.1.0
+greenlet==3.1.1
 idna==3.10
-langchain==0.3.0
+langchain==0.3.1
-langchain-community==0.2.16
+langchain-community==0.3.1
 marshmallow==3.22.0
 marshmallow-enum==1.5.1
 multidict==6.0.5
@@ -30,4 +30,4 @@ tqdm==4.66.5
 typing-inspect==0.9.0
 typing_extensions==4.12.2
 urllib3==2.2.3
-yarl==1.11.1
+yarl==1.13.1
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,6 +1,90 @@
 ---
-## Qwen2.5
+- name: "salamandra-7b-instruct"
  icon: https://huggingface.co/BSC-LT/salamandra-7b-instruct/resolve/main/images/salamandra_header.png
  # Uses chatml
  url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
  license: apache-2.0
  urls:
    - https://huggingface.co/BSC-LT/salamandra-7b-instruct
    - https://huggingface.co/cstr/salamandra-7b-instruct-GGUF
  tags:
    - llm
    - gguf
    - gpu
    - cpu
    - salamandra
  description: |
    Transformer-based decoder-only language model that has been pre-trained on 7.8 trillion tokens of highly curated data. The pre-training corpus contains text in 35 European languages and code.
    Salamandra comes in three different sizes — 2B, 7B and 40B parameters — with their respective base and instruction-tuned variants. This model card corresponds to the 7B instructed version.
  overrides:
    parameters:
      model: salamandra-7b-instruct.Q4_K_M-f32.gguf
  files:
    - filename: salamandra-7b-instruct.Q4_K_M-f32.gguf
      sha256: bac8e8c1d1d9d53cbdb148b8ff9ad378ddb392429207099e85b5aae3a43bff3d
      uri: huggingface://cstr/salamandra-7b-instruct-GGUF/salamandra-7b-instruct.Q4_K_M-f32.gguf
 ## llama3.2
 - &llama32
  url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master"
  icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/aJJxKus1wP5N-euvHEUq7.png
  license: llama3.2
  description: |
    The Meta Llama 3.2 collection of multilingual large language models (LLMs) is a collection of pretrained and instruction-tuned generative models in 1B and 3B sizes (text in/text out). The Llama 3.2 instruction-tuned text only models are optimized for multilingual dialogue use cases, including agentic retrieval and summarization tasks. They outperform many of the available open source and closed chat models on common industry benchmarks.
    Model Developer: Meta
    Model Architecture: Llama 3.2 is an auto-regressive language model that uses an optimized transformer architecture. The tuned versions use supervised fine-tuning (SFT) and reinforcement learning with human feedback (RLHF) to align with human preferences for helpfulness and safety.
  tags:
    - llm
    - gguf
    - gpu
    - cpu
    - llama3.2
  name: "llama-3.2-1b-instruct:q4_k_m"
  urls:
    - https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF
  overrides:
    parameters:
      model: llama-3.2-1b-instruct-q4_k_m.gguf
  files:
    - filename: llama-3.2-1b-instruct-q4_k_m.gguf
      sha256: 1d0e9419ec4e12aef73ccf4ffd122703e94c48344a96bc7c5f0f2772c2152ce3
      uri: huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/llama-3.2-1b-instruct-q4_k_m.gguf
 - !!merge <<: *llama32
  name: "llama-3.2-3b-instruct:q4_k_m"
  urls:
    - https://huggingface.co/hugging-quants/Llama-3.2-3B-Instruct-Q4_K_M-GGUF
  overrides:
    parameters:
      model: llama-3.2-3b-instruct-q4_k_m.gguf
  files:
    - filename: llama-3.2-3b-instruct-q4_k_m.gguf
      sha256: c55a83bfb6396799337853ca69918a0b9bbb2917621078c34570bc17d20fd7a1
      uri: huggingface://hugging-quants/Llama-3.2-3B-Instruct-Q4_K_M-GGUF/llama-3.2-3b-instruct-q4_k_m.gguf
 - !!merge <<: *llama32
  name: "llama-3.2-3b-instruct:q8_0"
  urls:
    - https://huggingface.co/hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF
  overrides:
    parameters:
      model: llama-3.2-3b-instruct-q8_0.gguf
  files:
    - filename: llama-3.2-3b-instruct-q8_0.gguf
      sha256: 51725f77f997a5080c3d8dd66e073da22ddf48ab5264f21f05ded9b202c3680e
      uri: huggingface://hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF/llama-3.2-3b-instruct-q8_0.gguf
 - !!merge <<: *llama32
  name: "llama-3.2-1b-instruct:q8_0"
  urls:
    - https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q8_0-GGUF
  overrides:
    parameters:
      model: llama-3.2-1b-instruct-q8_0.gguf
  files:
    - filename: llama-3.2-1b-instruct-q8_0.gguf
      sha256: ba345c83bf5cc679c653b853c46517eea5a34f03ed2205449db77184d9ae62a9
      uri: huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q8_0-GGUF/llama-3.2-1b-instruct-q8_0.gguf
 - &qwen25
  ## Qwen2.5
  name: "qwen2.5-14b-instruct"
  url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
  license: apache-2.0
@@ -135,8 +219,8 @@
      model: Qwen2.5-32B.Q4_K_M.gguf
  files:
    - filename: Qwen2.5-32B.Q4_K_M.gguf
      sha256: 02703e27c8b964db445444581a6937ad7538f0c32a100b26b49fa0e8ff527155
      uri: huggingface://mradermacher/Qwen2.5-32B-GGUF/Qwen2.5-32B.Q4_K_M.gguf
      sha256: fa42a4067e3630929202b6bb1ef5cebc43c1898494aedfd567b7d53c7a9d84a6
 - !!merge <<: *qwen25
  name: "qwen2.5-32b-instruct"
  urls:
@@ -161,8 +245,82 @@
    - filename: Qwen2.5-72B-Instruct-Q4_K_M.gguf
      sha256: e4c8fad16946be8cf0bbf67eb8f4e18fc7415a5a6d2854b4cda453edb4082545
      uri: huggingface://bartowski/Qwen2.5-72B-Instruct-GGUF/Qwen2.5-72B-Instruct-Q4_K_M.gguf
-## SmolLM
+- !!merge <<: *qwen25
  name: "bigqwen2.5-52b-instruct"
  icon: https://cdn-uploads.huggingface.co/production/uploads/61b8e2ba285851687028d395/98GiKtmH1AtHHbIbOUH4Y.jpeg
  urls:
    - https://huggingface.co/mlabonne/BigQwen2.5-52B-Instruct
    - https://huggingface.co/bartowski/BigQwen2.5-52B-Instruct-GGUF
  description: |
    BigQwen2.5-52B-Instruct is a Qwen/Qwen2-32B-Instruct self-merge made with MergeKit.
    It applies the mlabonne/Meta-Llama-3-120B-Instruct recipe.
  overrides:
    parameters:
      model: BigQwen2.5-52B-Instruct-Q4_K_M.gguf
  files:
    - filename: BigQwen2.5-52B-Instruct-Q4_K_M.gguf
      sha256: 9c939f08e366b51b07096eb2ecb5cc2a82894ac7baf639e446237ad39889c896
      uri: huggingface://bartowski/BigQwen2.5-52B-Instruct-GGUF/BigQwen2.5-52B-Instruct-Q4_K_M.gguf
 - !!merge <<: *qwen25
  name: "replete-llm-v2.5-qwen-14b"
  icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/ihnWXDEgV-ZKN_B036U1J.png
  urls:
    - https://huggingface.co/Replete-AI/Replete-LLM-V2.5-Qwen-14b
    - https://huggingface.co/bartowski/Replete-LLM-V2.5-Qwen-14b-GGUF
  description: |
     Replete-LLM-V2.5-Qwen-14b is a continuously finetuned version of Qwen2.5-14B. I noticed recently that the Qwen team did not learn from my methods of continuous finetuning, the great benefits, and no downsides of it. So I took it upon myself to merge the instruct model with the base model myself using the Ties merge method.
    This version of the model shows higher performance than the original instruct and base models.
  overrides:
    parameters:
      model: Replete-LLM-V2.5-Qwen-14b-Q4_K_M.gguf
  files:
    - filename: Replete-LLM-V2.5-Qwen-14b-Q4_K_M.gguf
      sha256: 17d0792ff5e3062aecb965629f66e679ceb407e4542e8045993dcfe9e7e14d9d
      uri: huggingface://bartowski/Replete-LLM-V2.5-Qwen-14b-GGUF/Replete-LLM-V2.5-Qwen-14b-Q4_K_M.gguf
 - !!merge <<: *qwen25
  name: "replete-llm-v2.5-qwen-7b"
  icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/ihnWXDEgV-ZKN_B036U1J.png
  urls:
    - https://huggingface.co/Replete-AI/Replete-LLM-V2.5-Qwen-7b
    - https://huggingface.co/bartowski/Replete-LLM-V2.5-Qwen-7b-GGUF
  description: |
     Replete-LLM-V2.5-Qwen-7b is a continuously finetuned version of Qwen2.5-7B. I noticed recently that the Qwen team did not learn from my methods of continuous finetuning, the great benefits, and no downsides of it. So I took it upon myself to merge the instruct model with the base model myself using the Ties merge method.
    This version of the model shows higher performance than the original instruct and base models.
  overrides:
    parameters:
      model: Replete-LLM-V2.5-Qwen-7b-Q4_K_M.gguf
  files:
    - filename: Replete-LLM-V2.5-Qwen-7b-Q4_K_M.gguf
      sha256: 054d54972259c0398b4e0af3f408f608e1166837b1d7535d08fc440d1daf8639
      uri: huggingface://bartowski/Replete-LLM-V2.5-Qwen-7b-GGUF/Replete-LLM-V2.5-Qwen-7b-Q4_K_M.gguf
 - !!merge <<: *qwen25
  name: "calme-2.2-qwen2.5-72b-i1"
  icon: https://huggingface.co/MaziyarPanahi/calme-2.2-qwen2.5-72b/resolve/main/calme-2.webp
  urls:
    - https://huggingface.co/MaziyarPanahi/calme-2.2-qwen2.5-72b
    - https://huggingface.co/mradermacher/calme-2.2-qwen2.5-72b-i1-GGUF
  description: |
      This model is a fine-tuned version of the powerful Qwen/Qwen2.5-72B-Instruct, pushing the boundaries of natural language understanding and generation even further. My goal was to create a versatile and robust model that excels across a wide range of benchmarks and real-world applications.
      Use Cases
      This model is suitable for a wide range of applications, including but not limited to:
          Advanced question-answering systems
          Intelligent chatbots and virtual assistants
          Content generation and summarization
          Code generation and analysis
          Complex problem-solving and decision support
  overrides:
    parameters:
      model: calme-2.2-qwen2.5-72b.i1-Q4_K_M.gguf
  files:
    - filename: calme-2.2-qwen2.5-72b.i1-Q4_K_M.gguf
      sha256: 5fdfa599724d7c78502c477ced1d294e92781b91d3265bd0748fbf15a6fefde6
      uri: huggingface://mradermacher/calme-2.2-qwen2.5-72b-i1-GGUF/calme-2.2-qwen2.5-72b.i1-Q4_K_M.gguf
 - &smollm
  ## SmolLM
  url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
  name: "smollm-1.7b-instruct"
  icon: https://huggingface.co/datasets/HuggingFaceTB/images/resolve/main/banner_smol.png
@@ -1555,6 +1713,27 @@
    - filename: MN-12B-Lyra-v4-Q4_K_M-imat.gguf
      sha256: 1989123481ca1936c8a2cbe278ff5d1d2b0ae63dbdc838bb36a6d7547b8087b3
      uri: huggingface://Lewdiculous/MN-12B-Lyra-v4-GGUF-IQ-Imatrix/MN-12B-Lyra-v4-Q4_K_M-imat.gguf
 - !!merge <<: *mistral03
  name: "magnusintellectus-12b-v1-i1"
  url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
  icon: https://cdn-uploads.huggingface.co/production/uploads/66b564058d9afb7a9d5607d5/hUVJI1Qa4tCMrZWMgYkoD.png
  urls:
    - https://huggingface.co/GalrionSoftworks/MagnusIntellectus-12B-v1
    - https://huggingface.co/mradermacher/MagnusIntellectus-12B-v1-i1-GGUF
  description: |
    How pleasant, the rocks appear to have made a decent conglomerate. A-.
    MagnusIntellectus is a merge of the following models using LazyMergekit:
        UsernameJustAnother/Nemo-12B-Marlin-v5
        anthracite-org/magnum-12b-v2
  overrides:
    parameters:
      model: MagnusIntellectus-12B-v1.i1-Q4_K_M.gguf
  files:
    - filename: MagnusIntellectus-12B-v1.i1-Q4_K_M.gguf
      sha256: c97107983b4edc5b6f2a592d227ca2dd4196e2af3d3bc0fe6b7a8954a1fb5870
      uri: huggingface://mradermacher/MagnusIntellectus-12B-v1-i1-GGUF/MagnusIntellectus-12B-v1.i1-Q4_K_M.gguf
 - &mudler
  ### START mudler's LocalAI specific-models
  url: "github:mudler/LocalAI/gallery/mudler.yaml@master"
--- a/pkg/grpc/backend.go
+++ b/pkg/grpc/backend.go
@@ -51,4 +51,6 @@ type Backend interface {
 	StoresFind(ctx context.Context, in *pb.StoresFindOptions, opts ...grpc.CallOption) (*pb.StoresFindResult, error)
 	Rerank(ctx context.Context, in *pb.RerankRequest, opts ...grpc.CallOption) (*pb.RerankResult, error)
 	GetTokenMetrics(ctx context.Context, in *pb.MetricsRequest, opts ...grpc.CallOption) (*pb.MetricsResponse, error)
 }
--- a/pkg/grpc/client.go
+++ b/pkg/grpc/client.go
@@ -374,3 +374,21 @@ func (c *Client) Rerank(ctx context.Context, in *pb.RerankRequest, opts ...grpc.
 	client := pb.NewBackendClient(conn)
 	return client.Rerank(ctx, in, opts...)
 }
 // GetTokenMetrics opens a fresh connection to the backend address, issues the
 // GetMetrics RPC with the given request and call options, and returns its
 // result. The connection is closed before returning.
 func (c *Client) GetTokenMetrics(ctx context.Context, in *pb.MetricsRequest, opts ...grpc.CallOption) (*pb.MetricsResponse, error) {
 	// Unless the client is in parallel mode, hold opMutex for the whole call.
 	if !c.parallel {
 		c.opMutex.Lock()
 		defer c.opMutex.Unlock()
 	}
 	// Flag the client as busy and mark the watchdog for the duration of the
 	// call; both are undone on return.
 	c.setBusy(true)
 	defer c.setBusy(false)
 	c.wdMark()
 	defer c.wdUnMark()
 	// Dial without transport security.
 	creds := grpc.WithTransportCredentials(insecure.NewCredentials())
 	cc, dialErr := grpc.Dial(c.address, creds)
 	if dialErr != nil {
 		return nil, dialErr
 	}
 	defer cc.Close()
 	return pb.NewBackendClient(cc).GetMetrics(ctx, in, opts...)
 }
--- a/pkg/grpc/embed.go
+++ b/pkg/grpc/embed.go
@@ -87,6 +87,10 @@ func (e *embedBackend) Rerank(ctx context.Context, in *pb.RerankRequest, opts ..
 	return e.s.Rerank(ctx, in)
 }
 // GetTokenMetrics forwards the request to the GetMetrics method of the
 // embedded server e.s and returns its result. The opts call options are
 // accepted but not passed on.
 func (e *embedBackend) GetTokenMetrics(ctx context.Context, in *pb.MetricsRequest, opts ...grpc.CallOption) (*pb.MetricsResponse, error) {
 	return e.s.GetMetrics(ctx, in)
 }
 type embedBackendServerStream struct {
 	ctx context.Context
 	fn  func(s []byte)
--- a/pkg/grpc/server.go
+++ b/pkg/grpc/server.go
@@ -144,6 +144,8 @@ func (s *server) PredictStream(in *pb.PredictOptions, stream pb.Backend_PredictS
 	}()
 	err := s.llm.PredictStream(in, resultChan)
 	// Close the channel so that, if resultChan is not closed by the LLM (for example because it does not implement PredictStream), the client will not hang.
 	close(resultChan)
 	<-done
 	return err
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@@ -268,10 +268,10 @@ func selectGRPCProcess(backend, assetDir string, f16 bool) string {
 // starts the grpcModelProcess for the backend, and returns a grpc client
 // It also loads the model
-func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string) (*Model, error) {
+func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string, string) (*Model, error) {
-	return func(modelName, modelFile string) (*Model, error) {
+	return func(modelID, modelName, modelFile string) (*Model, error) {
-		log.Debug().Msgf("Loading Model %s with gRPC (file: %s) (backend: %s): %+v", modelName, modelFile, backend, *o)
+		log.Debug().Msgf("Loading Model %s with gRPC (file: %s) (backend: %s): %+v", modelID, modelFile, backend, *o)
 		var client *Model
@@ -304,18 +304,19 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
 					return nil, fmt.Errorf("failed allocating free ports: %s", err.Error())
 				}
 				// Make sure the process is executable
-				if err := ml.startProcess(uri, o.model, serverAddress); err != nil {
+				process, err := ml.startProcess(uri, modelID, serverAddress)
 				if err != nil {
 					log.Error().Err(err).Str("path", uri).Msg("failed to launch ")
 					return nil, err
 				}
 				log.Debug().Msgf("GRPC Service Started")
-				client = NewModel(serverAddress)
+				client = NewModel(modelID, serverAddress, process)
 			} else {
-				log.Debug().Msg("external backend is uri")
+				log.Debug().Msg("external backend is a uri")
 				// address
-				client = NewModel(uri)
+				client = NewModel(modelID, uri, nil)
 			}
 		} else {
 			grpcProcess := backendPath(o.assetDir, backend)
@@ -346,13 +347,14 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
 			args, grpcProcess = library.LoadLDSO(o.assetDir, args, grpcProcess)
 			// Make sure the process is executable in any circumstance
-			if err := ml.startProcess(grpcProcess, o.model, serverAddress, args...); err != nil {
+			process, err := ml.startProcess(grpcProcess, modelID, serverAddress, args...)
 			if err != nil {
 				return nil, err
 			}
 			log.Debug().Msgf("GRPC Service Started")
-			client = NewModel(serverAddress)
+			client = NewModel(modelID, serverAddress, process)
 		}
 		log.Debug().Msgf("Wait for the service to start up")
@@ -374,6 +376,9 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
 		if !ready {
 			log.Debug().Msgf("GRPC Service NOT ready")
 			if process := client.Process(); process != nil {
 				process.Stop()
 			}
 			return nil, fmt.Errorf("grpc service not ready")
 		}
@@ -385,9 +390,15 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
 		res, err := client.GRPC(o.parallelRequests, ml.wd).LoadModel(o.context, &options)
 		if err != nil {
 			if process := client.Process(); process != nil {
 				process.Stop()
 			}
 			return nil, fmt.Errorf("could not load model: %w", err)
 		}
 		if !res.Success {
 			if process := client.Process(); process != nil {
 				process.Stop()
 			}
 			return nil, fmt.Errorf("could not load model (no success): %s", res.Message)
 		}
@@ -402,11 +413,7 @@ func (ml *ModelLoader) ListAvailableBackends(assetdir string) ([]string, error)
 func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err error) {
 	o := NewOptions(opts...)
-	if o.model != "" {
+	log.Info().Msgf("Loading model '%s' with backend %s", o.modelID, o.backendString)
 		log.Info().Msgf("Loading model '%s' with backend %s", o.model, o.backendString)
 	} else {
 		log.Info().Msgf("Loading model with backend %s", o.backendString)
 	}
 	backend := strings.ToLower(o.backendString)
 	if realBackend, exists := Aliases[backend]; exists {
@@ -415,11 +422,10 @@ func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err e
 	}
 	if o.singleActiveBackend {
-		log.Debug().Msgf("Stopping all backends except '%s'", o.model)
+		log.Debug().Msgf("Stopping all backends except '%s'", o.modelID)
-		err := ml.StopGRPC(allExcept(o.model))
+		err := ml.StopGRPC(allExcept(o.modelID))
 		if err != nil {
-			log.Error().Err(err).Str("keptModel", o.model).Msg("error while shutting down all backends except for the keptModel")
+			log.Error().Err(err).Str("keptModel", o.modelID).Msg("error while shutting down all backends except for the keptModel")
 			return nil, err
 		}
 	}
@@ -433,7 +439,7 @@ func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err e
 		backendToConsume = backend
 	}
-	model, err := ml.LoadModel(o.model, ml.grpcModel(backendToConsume, o))
+	model, err := ml.LoadModel(o.modelID, o.model, ml.grpcModel(backendToConsume, o))
 	if err != nil {
 		return nil, err
 	}
@@ -446,18 +452,18 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {
 	// Return earlier if we have a model already loaded
 	// (avoid looping through all the backends)
-	if m := ml.CheckIsLoaded(o.model); m != nil {
+	if m := ml.CheckIsLoaded(o.modelID); m != nil {
-		log.Debug().Msgf("Model '%s' already loaded", o.model)
+		log.Debug().Msgf("Model '%s' already loaded", o.modelID)
 		return m.GRPC(o.parallelRequests, ml.wd), nil
 	}
 	// If we can have only one backend active, kill all the others (except external backends)
 	if o.singleActiveBackend {
-		log.Debug().Msgf("Stopping all backends except '%s'", o.model)
+		log.Debug().Msgf("Stopping all backends except '%s'", o.modelID)
-		err := ml.StopGRPC(allExcept(o.model))
+		err := ml.StopGRPC(allExcept(o.modelID))
 		if err != nil {
-			log.Error().Err(err).Str("keptModel", o.model).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing")
+			log.Error().Err(err).Str("keptModel", o.modelID).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing")
 		}
 	}
@@ -476,23 +482,13 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {
 	log.Debug().Msgf("Loading from the following backends (in order): %+v", autoLoadBackends)
-	if o.model != "" {
+	log.Info().Msgf("Trying to load the model '%s' with the backend '%s'", o.modelID, autoLoadBackends)
 		log.Info().Msgf("Trying to load the model '%s' with the backend '%s'", o.model, autoLoadBackends)
 	}
 	for _, key := range autoLoadBackends {
 		log.Info().Msgf("[%s] Attempting to load", key)
-		options := []Option{
+		options := append(opts, []Option{
 			WithBackendString(key),
-			WithModel(o.model),
+		}...)
 			WithLoadGRPCLoadModelOpts(o.gRPCOptions),
 			WithThreads(o.threads),
 			WithAssetDir(o.assetDir),
 		}
 		for k, v := range o.externalBackends {
 			options = append(options, WithExternalBackend(k, v))
 		}
 		model, modelerr := ml.BackendLoader(options...)
 		if modelerr == nil && model != nil {
--- a/pkg/model/loader.go
+++ b/pkg/model/loader.go
@@ -13,7 +13,6 @@ import (
 	"github.com/mudler/LocalAI/pkg/utils"
 	process "github.com/mudler/go-processmanager"
 	"github.com/rs/zerolog/log"
 )
@@ -24,7 +23,6 @@ type ModelLoader struct {
 	ModelPath string
 	mu        sync.Mutex
 	models    map[string]*Model
 	grpcProcesses map[string]*process.Process
 	templates *templates.TemplateCache
 	wd        *WatchDog
 }
@@ -34,7 +32,6 @@ func NewModelLoader(modelPath string) *ModelLoader {
 		ModelPath: modelPath,
 		models:    make(map[string]*Model),
 		templates: templates.NewTemplateCache(modelPath),
 		grpcProcesses: make(map[string]*process.Process),
 	}
 	return nml
@@ -105,21 +102,21 @@ FILE:
 	return models, nil
 }
-func (ml *ModelLoader) ListModels() []*Model {
+func (ml *ModelLoader) ListModels() []Model {
 	ml.mu.Lock()
 	defer ml.mu.Unlock()
-	models := []*Model{}
+	models := []Model{}
 	for _, model := range ml.models {
-		models = append(models, model)
+		models = append(models, *model)
 	}
 	return models
 }
-func (ml *ModelLoader) LoadModel(modelName string, loader func(string, string) (*Model, error)) (*Model, error) {
+func (ml *ModelLoader) LoadModel(modelID, modelName string, loader func(string, string, string) (*Model, error)) (*Model, error) {
 	// Check if we already have a loaded model
-	if model := ml.CheckIsLoaded(modelName); model != nil {
+	if model := ml.CheckIsLoaded(modelID); model != nil {
 		return model, nil
 	}
@@ -127,18 +124,18 @@ func (ml *ModelLoader) LoadModel(modelName string, loader func(string, string) (
 	modelFile := filepath.Join(ml.ModelPath, modelName)
 	log.Debug().Msgf("Loading model in memory from file: %s", modelFile)
-	model, err := loader(modelName, modelFile)
+	ml.mu.Lock()
 	defer ml.mu.Unlock()
 	model, err := loader(modelID, modelName, modelFile)
 	if err != nil {
-		return nil, err
+		return nil, fmt.Errorf("failed to load model with internal loader: %s", err)
 	}
 	if model == nil {
 		return nil, fmt.Errorf("loader didn't return a model")
 	}
-	ml.mu.Lock()
+	ml.models[modelID] = model
 	defer ml.mu.Unlock()
 	ml.models[modelName] = model
 	return model, nil
 }
@@ -146,14 +143,13 @@ func (ml *ModelLoader) LoadModel(modelName string, loader func(string, string) (
 func (ml *ModelLoader) ShutdownModel(modelName string) error {
 	ml.mu.Lock()
 	defer ml.mu.Unlock()
-
+	model, ok := ml.models[modelName]
 	_, ok := ml.models[modelName]
 	if !ok {
 		return fmt.Errorf("model %s not found", modelName)
 	}
 	retries := 1
-	for ml.models[modelName].GRPC(false, ml.wd).IsBusy() {
+	for model.GRPC(false, ml.wd).IsBusy() {
 		log.Debug().Msgf("%s busy. Waiting.", modelName)
 		dur := time.Duration(retries*2) * time.Second
 		if dur > retryTimeout {
@@ -185,8 +181,8 @@ func (ml *ModelLoader) CheckIsLoaded(s string) *Model {
 	if !alive {
 		log.Warn().Msgf("GRPC Model not responding: %s", err.Error())
 		log.Warn().Msgf("Deleting the process in order to recreate it")
-		process, exists := ml.grpcProcesses[s]
+		process := m.Process()
-		if !exists {
+		if process == nil {
 			log.Error().Msgf("Process not found for '%s' and the model is not responding anymore !", s)
 			return m
 		}
--- a/pkg/model/loader_test.go
+++ b/pkg/model/loader_test.go
@@ -63,24 +63,24 @@ var _ = Describe("ModelLoader", func() {
 	Context("LoadModel", func() {
 		It("should load a model and keep it in memory", func() {
-			mockModel = model.NewModel("test.model")
+			mockModel = model.NewModel("foo", "test.model", nil)
-			mockLoader := func(modelName, modelFile string) (*model.Model, error) {
+			mockLoader := func(modelID, modelName, modelFile string) (*model.Model, error) {
 				return mockModel, nil
 			}
-			model, err := modelLoader.LoadModel("test.model", mockLoader)
+			model, err := modelLoader.LoadModel("foo", "test.model", mockLoader)
 			Expect(err).To(BeNil())
 			Expect(model).To(Equal(mockModel))
-			Expect(modelLoader.CheckIsLoaded("test.model")).To(Equal(mockModel))
+			Expect(modelLoader.CheckIsLoaded("foo")).To(Equal(mockModel))
 		})
 		It("should return an error if loading the model fails", func() {
-			mockLoader := func(modelName, modelFile string) (*model.Model, error) {
+			mockLoader := func(modelID, modelName, modelFile string) (*model.Model, error) {
 				return nil, errors.New("failed to load model")
 			}
-			model, err := modelLoader.LoadModel("test.model", mockLoader)
+			model, err := modelLoader.LoadModel("foo", "test.model", mockLoader)
 			Expect(err).To(HaveOccurred())
 			Expect(model).To(BeNil())
 		})
@@ -88,18 +88,16 @@ var _ = Describe("ModelLoader", func() {
 	Context("ShutdownModel", func() {
 		It("should shutdown a loaded model", func() {
-			mockModel = model.NewModel("test.model")
+			mockLoader := func(modelID, modelName, modelFile string) (*model.Model, error) {
-
+				return model.NewModel("foo", "test.model", nil), nil
 			mockLoader := func(modelName, modelFile string) (*model.Model, error) {
 				return mockModel, nil
 			}
-			_, err := modelLoader.LoadModel("test.model", mockLoader)
+			_, err := modelLoader.LoadModel("foo", "test.model", mockLoader)
 			Expect(err).To(BeNil())
-			err = modelLoader.ShutdownModel("test.model")
+			err = modelLoader.ShutdownModel("foo")
 			Expect(err).To(BeNil())
-			Expect(modelLoader.CheckIsLoaded("test.model")).To(BeNil())
+			Expect(modelLoader.CheckIsLoaded("foo")).To(BeNil())
 		})
 	})
 })
--- a/pkg/model/model.go
+++ b/pkg/model/model.go
@@ -1,18 +1,32 @@
 package model
-import grpc "github.com/mudler/LocalAI/pkg/grpc"
+import (
 	"sync"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
 	process "github.com/mudler/go-processmanager"
 )
 // Model groups what is tracked for one backend: an identifier, the address
 // of the backend, a gRPC client and an optional process handle.
 type Model struct {
 	// ID is the only field with a JSON tag; it is serialized as "id".
 	ID      string `json:"id"`
 	// address is the target passed to grpc.NewClient by the GRPC method.
 	address string
 	// client is the backend client returned by GRPC; nil until first created.
 	client  grpc.Backend
 	// process is the handle returned by Process; it may be nil.
 	process *process.Process
 	// The embedded mutex is locked by GRPC while it creates the client.
 	sync.Mutex
 }
-func NewModel(address string) *Model {
+func NewModel(ID, address string, process *process.Process) *Model {
 	return &Model{
 		ID:      ID,
 		address: address,
 		process: process,
 	}
 }
 // Process returns the process handle stored on the model. The result is nil
 // when the model was created without one.
 func (m *Model) Process() *process.Process {
 	return m.process
 }
 func (m *Model) GRPC(parallel bool, wd *WatchDog) grpc.Backend {
 	if m.client != nil {
 		return m.client
@@ -23,6 +37,8 @@ func (m *Model) GRPC(parallel bool, wd *WatchDog) grpc.Backend {
 		enableWD = true
 	}
 	m.Lock()
 	defer m.Unlock()
 	m.client = grpc.NewClient(m.address, parallel, wd, enableWD)
 	return m.client
 }
--- a/pkg/model/options.go
+++ b/pkg/model/options.go
@@ -9,7 +9,7 @@ import (
 type Options struct {
 	backendString string
 	model         string
-	threads       uint32
+	modelID       string
 	assetDir      string
 	context       context.Context
@@ -68,12 +68,6 @@ func WithLoadGRPCLoadModelOpts(opts *pb.ModelOptions) Option {
 	}
 }
 // WithThreads returns an Option that stores threads in the Options being built.
 // NOTE(review): the enclosing hunk header (-68,12 +68,6) and the removal of
 // the threads field from Options suggest this function is deleted by this
 // change — confirm before relying on it.
 func WithThreads(threads uint32) Option {
 	return func(o *Options) {
 		o.threads = threads
 	}
 }
 func WithAssetDir(assetDir string) Option {
 	return func(o *Options) {
 		o.assetDir = assetDir
@@ -92,6 +86,12 @@ func WithSingleActiveBackend() Option {
 	}
 }
 // WithModelID returns an Option that stores id as the model ID in the
 // Options being built.
 func WithModelID(id string) Option {
 	return func(o *Options) {
 		o.modelID = id
 	}
 }
 func NewOptions(opts ...Option) *Options {
 	o := &Options{
 		gRPCOptions:       &pb.ModelOptions{},
--- a/pkg/model/process.go
+++ b/pkg/model/process.go
@@ -16,20 +16,36 @@ import (
 )
 func (ml *ModelLoader) deleteProcess(s string) error {
-	if _, exists := ml.grpcProcesses[s]; exists {
+	defer delete(ml.models, s)
-		if err := ml.grpcProcesses[s].Stop(); err != nil {
+
-			log.Error().Err(err).Msgf("(deleteProcess) error while deleting grpc process %s", s)
+	log.Debug().Msgf("Deleting process %s", s)
-		}
+
-	}
+	m, exists := ml.models[s]
-	delete(ml.grpcProcesses, s)
+	if !exists {
-	delete(ml.models, s)
+		log.Error().Msgf("Model does not exist %s", s)
 		// Nothing to do
 		return nil
 	}
 	process := m.Process()
 	if process == nil {
 		log.Error().Msgf("No process for %s", s)
 		// Nothing to do as there is no process
 		return nil
 	}
 	err := process.Stop()
 	if err != nil {
 		log.Error().Err(err).Msgf("(deleteProcess) error while deleting process %s", s)
 	}
 	return err
 }
 func (ml *ModelLoader) StopGRPC(filter GRPCProcessFilter) error {
 	var err error = nil
-	for k, p := range ml.grpcProcesses {
+	for k, m := range ml.models {
-		if filter(k, p) {
+		if filter(k, m.Process()) {
 			e := ml.ShutdownModel(k)
 			err = errors.Join(err, e)
 		}
@@ -44,17 +60,20 @@ func (ml *ModelLoader) StopAllGRPC() error {
 func (ml *ModelLoader) GetGRPCPID(id string) (int, error) {
 	ml.mu.Lock()
 	defer ml.mu.Unlock()
-	p, exists := ml.grpcProcesses[id]
+	p, exists := ml.models[id]
 	if !exists {
 		return -1, fmt.Errorf("no grpc backend found for %s", id)
 	}
-	return strconv.Atoi(p.PID)
+	if p.Process() == nil {
 		return -1, fmt.Errorf("no grpc backend found for %s", id)
 	}
 	return strconv.Atoi(p.Process().PID)
 }
-func (ml *ModelLoader) startProcess(grpcProcess, id string, serverAddress string, args ...string) error {
+func (ml *ModelLoader) startProcess(grpcProcess, id string, serverAddress string, args ...string) (*process.Process, error) {
 	// Make sure the process is executable
 	if err := os.Chmod(grpcProcess, 0700); err != nil {
-		return err
+		return nil, err
 	}
 	log.Debug().Msgf("Loading GRPC Process: %s", grpcProcess)
@@ -63,7 +82,7 @@ func (ml *ModelLoader) startProcess(grpcProcess, id string, serverAddress string
 	workDir, err := filepath.Abs(filepath.Dir(grpcProcess))
 	if err != nil {
-		return err
+		return nil, err
 	}
 	grpcControlProcess := process.New(
@@ -79,10 +98,8 @@ func (ml *ModelLoader) startProcess(grpcProcess, id string, serverAddress string
 		ml.wd.AddAddressModelMap(serverAddress, id)
 	}
 	ml.grpcProcesses[id] = grpcControlProcess
 	if err := grpcControlProcess.Run(); err != nil {
-		return err
+		return grpcControlProcess, err
 	}
 	log.Debug().Msgf("GRPC Service state dir: %s", grpcControlProcess.StateDir())
@@ -116,5 +133,5 @@ func (ml *ModelLoader) startProcess(grpcProcess, id string, serverAddress string
 		}
 	}()
-	return nil
+	return grpcControlProcess, nil
 }
--- a/pkg/templates/multimodal.go
+++ b/pkg/templates/multimodal.go
@@ -0,0 +1,24 @@
 package templates
 import (
 	"bytes"
 	"text/template"
 )
 // TemplateMultiModal renders templateString with text/template, exposing
 // templateID as {{.ID}} and text as {{.Text}}.
 //
 // It returns the rendered string. If the template fails to parse or to
 // execute, it returns an empty string and the error.
 func TemplateMultiModal(templateString string, templateID int, text string) (string, error) {
 	// compile the template
 	tmpl, err := template.New("template").Parse(templateString)
 	if err != nil {
 		return "", err
 	}
 	// The zero-value buffer is ready to use.
 	var result bytes.Buffer
 	data := struct {
 		ID   int
 		Text string
 	}{
 		ID:   templateID,
 		Text: text,
 	}
 	// Execute can fail after part of the output has already been written;
 	// discard that partial output so callers never see a half-rendered string.
 	if err := tmpl.Execute(&result, data); err != nil {
 		return "", err
 	}
 	return result.String(), nil
 }
--- a/pkg/templates/multimodal_test.go
+++ b/pkg/templates/multimodal_test.go
@@ -0,0 +1,19 @@
 package templates_test
 import (
 	. "github.com/mudler/LocalAI/pkg/templates" // package under test
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
 // Specs for TemplateMultiModal: a template referencing both .ID and .Text
 // must render with the supplied values substituted.
 var _ = Describe("EvaluateTemplate", func() {
 	Context("templating simple strings for multimodal chat", func() {
 		It("should template messages correctly", func() {
 			const tmpl = "[img-{{.ID}}]{{.Text}}"
 			rendered, renderErr := TemplateMultiModal(tmpl, 1, "bar")
 			Expect(renderErr).NotTo(HaveOccurred())
 			Expect(rendered).To(Equal("[img-1]bar"))
 		})
 	})
 })
--- a/swagger/docs.go
+++ b/swagger/docs.go
@@ -972,6 +972,14 @@ const docTemplate = `{
                }
            }
        },
        "model.Model": {
            "type": "object",
            "properties": {
                "id": {
                    "type": "string"
                }
            }
        },
        "openai.Assistant": {
            "type": "object",
            "properties": {
@@ -1682,6 +1690,12 @@ const docTemplate = `{
                    "items": {
                        "type": "string"
                    }
                },
                "loaded_models": {
                    "type": "array",
                    "items": {
                        "$ref": "#/definitions/model.Model"
                    }
                }
            }
        },
--- a/swagger/swagger.json
+++ b/swagger/swagger.json
@@ -965,6 +965,14 @@
                }
            }
        },
        "model.Model": {
            "type": "object",
            "properties": {
                "id": {
                    "type": "string"
                }
            }
        },
        "openai.Assistant": {
            "type": "object",
            "properties": {
@@ -1675,6 +1683,12 @@
                    "items": {
                        "type": "string"
                    }
                },
                "loaded_models": {
                    "type": "array",
                    "items": {
                        "$ref": "#/definitions/model.Model"
                    }
                }
            }
        },
--- a/swagger/swagger.yaml
+++ b/swagger/swagger.yaml
@@ -168,6 +168,11 @@ definitions:
          type: string
        type: array
    type: object
  model.Model:
    properties:
      id:
        type: string
    type: object
  openai.Assistant:
    properties:
      created:
@@ -652,6 +657,10 @@ definitions:
        items:
          type: string
        type: array
      loaded_models:
        items:
          $ref: '#/definitions/model.Model'
        type: array
    type: object
  schema.TTSRequest:
    description: TTS request body
--- a/tests/e2e-aio/e2e_test.go
+++ b/tests/e2e-aio/e2e_test.go
@@ -260,11 +260,9 @@ var _ = Describe("E2E test", func() {
 				resp, err := http.Post(rerankerEndpoint, "application/json", bytes.NewReader(serialized))
 				Expect(err).To(BeNil())
 				Expect(resp).ToNot(BeNil())
 				Expect(resp.StatusCode).To(Equal(200))
 				body, err := io.ReadAll(resp.Body)
-				Expect(err).To(BeNil())
+				Expect(err).ToNot(HaveOccurred())
-				Expect(body).ToNot(BeNil())
+				Expect(resp.StatusCode).To(Equal(200), fmt.Sprintf("body: %s, response: %+v", body, resp))
 				deserializedResponse := schema.JINARerankResponse{}
 				err = json.Unmarshal(body, &deserializedResponse)
`@@ -1,2 +1,2 @@`
	`grpcio==1.66.1`	`grpcio==1.66.2`
	`protobuf`	`protobuf`
`@@ -1,2 +1,2 @@`
	`langchain==0.3.0`	`langchain==0.3.1`
	`openai==1.47.1`	`openai==1.50.2`