chore(deps): bump llama.cpp to '10f2e81809bbb69ecfe64fc8b4686285f84b0c07'

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-02-03 03:02:38 -05:00 · 2025-03-12 09:12:59 +01:00
99 changed files with 956 additions and 3696 deletions
--- a/.env
+++ b/.env
@@ -29,9 +29,6 @@
 ## Enable/Disable single backend (useful if only one GPU is available)
 # LOCALAI_SINGLE_ACTIVE_BACKEND=true

-# Forces shutdown of the backends if busy (only if LOCALAI_SINGLE_ACTIVE_BACKEND is set)
-# LOCALAI_FORCE_BACKEND_SHUTDOWN=true
-
 ## Specify a build type. Available: cublas, openblas, clblas.
 ## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
 ## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.
--- a/.github/workflows/generate_intel_image.yaml
+++ b/.github/workflows/generate_intel_image.yaml
@@ -15,7 +15,7 @@ jobs:
    strategy:
      matrix:
        include:
-          - base-image: intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04
+          - base-image: intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04
            runs-on: 'ubuntu-latest'
            platforms: 'linux/amd64'
    runs-on: ${{matrix.runs-on}}
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -18,7 +18,7 @@ jobs:
        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
        if: ${{ github.actor != 'dependabot[bot]' }}
-        uses: securego/gosec@v2.22.3
+        uses: securego/gosec@v2.22.0
        with:
          # we let the report trigger content trigger a failure using the GitHub Security features.
          args: '-no-fail -fmt sarif -out results.sarif ./...'
--- a/1
+++ b/1
@@ -24,7 +24,6 @@ RUN apt-get update && \
        ca-certificates \
        curl libssl-dev \
        git \
-        git-lfs \
        unzip upx-ucl && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
--- a/17
+++ b/17
@@ -6,7 +6,7 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true

 # llama.cpp versions
-CPPLLAMA_VERSION?=d6d2c2ab8c8865784ba9fef37f2b2de3f2134d33
+CPPLLAMA_VERSION?=10f2e81809bbb69ecfe64fc8b4686285f84b0c07

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
@@ -21,8 +21,8 @@ BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git
 BARKCPP_VERSION?=v1.0.0

 # stablediffusion.cpp (ggml)
-STABLEDIFFUSION_GGML_REPO?=https://github.com/richiejp/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=53e3b17eb3d0b5760ced06a1f98320b68b34aaae
+STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
+STABLEDIFFUSION_GGML_VERSION?=19d876ee300a055629926ff836489901f734f2b7

 ONNX_VERSION?=1.20.0
 ONNX_ARCH?=x64
@@ -260,7 +260,11 @@ backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp
 	$(MAKE) -C backend/go/image/stablediffusion-ggml libsd.a

 backend-assets/grpc/stablediffusion-ggml: backend/go/image/stablediffusion-ggml/libsd.a backend-assets/grpc
-	$(MAKE) -C backend/go/image/stablediffusion-ggml CGO_LDFLAGS="$(CGO_LDFLAGS)" stablediffusion-ggml
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ LIBRARY_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion-ggml ./backend/go/image/stablediffusion-ggml/
+ifneq ($(UPX),)
+	$(UPX) backend-assets/grpc/stablediffusion-ggml
+endif

 sources/onnxruntime:
 	mkdir -p sources/onnxruntime
@@ -805,8 +809,7 @@ docker-aio-all:

 docker-image-intel:
 	docker build \
-		--progress plain \
-		--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu24.04 \
+		--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
 		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
 		--build-arg GO_TAGS="none" \
 		--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@@ -814,7 +817,7 @@ docker-image-intel:

 docker-image-intel-xpu:
 	docker build \
-		--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04 \
+		--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
 		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
 		--build-arg GO_TAGS="none" \
 		--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 <h1 align="center">
  <br>
-  <img height="300" src="./core/http/static/logo.png"> <br>
+  <img height="300" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"> <br>
+    LocalAI
 <br>
 </h1>

@@ -47,58 +48,9 @@

 [![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)

-**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that's compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
+**LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API that's compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio, and more locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).

-
-## 📚🆕 Local Stack Family
-
-🆕 LocalAI is now part of a comprehensive suite of AI tools designed to work together:
-
-<table>
-  <tr>
-    <td width="50%" valign="top">
-      <a href="https://github.com/mudler/LocalAGI">
-        <img src="https://raw.githubusercontent.com/mudler/LocalAGI/refs/heads/main/webui/react-ui/public/logo_2.png" width="300" alt="LocalAGI Logo">
-      </a>
-    </td>
-    <td width="50%" valign="top">
-      <h3><a href="https://github.com/mudler/LocalAGI">LocalAGI</a></h3>
-      <p>A powerful Local AI agent management platform that serves as a drop-in replacement for OpenAI's Responses API, enhanced with advanced agentic capabilities.</p>
-    </td>
-  </tr>
-  <tr>
-    <td width="50%" valign="top">
-      <a href="https://github.com/mudler/LocalRecall">
-        <img src="https://raw.githubusercontent.com/mudler/LocalRecall/refs/heads/main/static/localrecall_horizontal.png" width="300" alt="LocalRecall Logo">
-      </a>
-    </td>
-    <td width="50%" valign="top">
-      <h3><a href="https://github.com/mudler/LocalRecall">LocalRecall</a></h3>
-      <p>A REST-ful API and knowledge base management system that provides persistent memory and storage capabilities for AI agents.</p>
-    </td>
-  </tr>
-</table>
-
-## Screenshots
-
-
-| Talk Interface | Generate Audio |
-| --- | --- |
-| ![Screenshot 2025-03-31 at 12-01-36 LocalAI - Talk](./docs/assets/images/screenshots/screenshot_tts.png) | ![Screenshot 2025-03-31 at 12-01-29 LocalAI - Generate audio with voice-en-us-ryan-low](./docs/assets/images/screenshots/screenshot_tts.png) |
-
-| Models Overview | Generate Images |
-| --- | --- |
-| ![Screenshot 2025-03-31 at 12-01-20 LocalAI - Models](./docs/assets/images/screenshots/screenshot_gallery.png) | ![Screenshot 2025-03-31 at 12-31-41 LocalAI - Generate images with flux 1-dev](./docs/assets/images/screenshots/screenshot_image.png) |
-
-| Chat Interface | Home |
-| --- | --- |
-| ![Screenshot 2025-03-31 at 11-57-44 LocalAI - Chat with localai-functioncall-qwen2 5-7b-v0 5](./docs/assets/images/screenshots/screenshot_chat.png) | ![Screenshot 2025-03-31 at 11-57-23 LocalAI API - c2a39e3 (c2a39e3639227cfd94ffffe9f5691239acc275a8)](./docs/assets/images/screenshots/screenshot_home.png) |
-
-| Login | Swarm |
-| --- | --- |
-|![Screenshot 2025-03-31 at 12-09-59 ](./docs/assets/images/screenshots/screenshot_login.png) | ![Screenshot 2025-03-31 at 12-10-39 LocalAI - P2P dashboard](./docs/assets/images/screenshots/screenshot_p2p.png) |
-
-## 💻 Quickstart
+![screen](https://github.com/mudler/LocalAI/assets/2420543/20b5ccd2-8393-44f0-aaf6-87a23806381e)

 Run the installer script:

@@ -107,21 +59,17 @@ curl https://localai.io/install.sh | sh
 ```

 Or run with docker:
-
-### CPU only image:
 ```bash
+# CPU only image:
 docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-cpu
-```
-### Nvidia GPU:
-```bash
+
+# Nvidia GPU:
 docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
-```
-### CPU and GPU image (bigger size):
-```bash
+
+# CPU and GPU image (bigger size):
 docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
-```
-### AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)
-```bash
+
+# AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)
 docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
 ```

@@ -140,13 +88,10 @@ local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
 local-ai run oci://localai/phi-2:latest
 ```

-For more information, see [💻 Getting started](https://localai.io/basics/getting_started/index.html)
+[💻 Getting started](https://localai.io/basics/getting_started/index.html)

 ## 📰 Latest project news

- Apr 2025: [LocalAGI](https://github.com/mudler/LocalAGI) and [LocalRecall](https://github.com/mudler/LocalRecall) join the LocalAI family stack.
- Apr 2025: WebUI overhaul, AIO images updates
- Feb 2025: Backend cleanup, Breaking changes, new backends (kokoro, OutelTTS, faster-whisper), Nvidia L4T images
 - Jan 2025: LocalAI model release: https://huggingface.co/mudler/LocalAI-functioncall-phi-4-v0.3, SANA support in diffusers: https://github.com/mudler/LocalAI/pull/4603
 - Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 )
 - Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 )
@@ -160,6 +105,19 @@ For more information, see [💻 Getting started](https://localai.io/basics/getti

 Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

+## 🔥🔥 Hot topics (looking for help):
+
+- Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
+- Realtime API: https://github.com/mudler/LocalAI/issues/3714
+- WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
+- Backends v2: https://github.com/mudler/LocalAI/issues/1126
+- Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
+- Assistant API: https://github.com/mudler/LocalAI/issues/1273
+- Vulkan: https://github.com/mudler/LocalAI/issues/1647
+- Anthropic API: https://github.com/mudler/LocalAI/issues/1808
+
+If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22
+
 ## 🚀 [Features](https://localai.io/features/)

 - 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `transformers`, `vllm` ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
@@ -173,10 +131,12 @@ Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3A
 - 🥽 [Vision API](https://localai.io/features/gpt-vision/)
 - 📈 [Reranker API](https://localai.io/features/reranker/)
 - 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
- [Agentic capabilities](https://github.com/mudler/LocalAGI)
 - 🔊 Voice activity detection (Silero-VAD support)
 - 🌍 Integrated WebUI!

+## 💻 Usage
+
+Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.

 ### 🔗 Community and integrations

--- a/aio/cpu/embeddings.yaml
+++ b/aio/cpu/embeddings.yaml
@@ -1,7 +1,7 @@
-embeddings: true
 name: text-embedding-ada-002
+embeddings: true
 parameters:
-  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
+  model: huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/llama-3.2-1b-instruct-q4_k_m.gguf

 usage: |
    You can test this model with curl like this:
--- a/aio/cpu/text-to-text.yaml
+++ b/aio/cpu/text-to-text.yaml
@@ -1,57 +1,101 @@
-context_size: 8192
-f16: true
-function:
-  grammar:
-    no_mixed_free_string: true
-    schema_type: llama3.1 # or JSON is supported too (json)
-  response_regex:
-  - <function=(?P<name>\w+)>(?P<arguments>.*)</function>
-mmap: true
 name: gpt-4
+mmap: true
 parameters:
-  model: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
+  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
+context_size: 8192
+
 stopwords:
- <|im_end|>
- <dummy32000>
- <|eot_id|>
- <|end_of_text|>
+- "<|im_end|>"
+- "<dummy32000>"
+- "</tool_call>"
+- "<|eot_id|>"
+- "<|end_of_text|>"
+
+function:
+  # disable injecting the "answer" tool
+  disable_no_action: true
+
+  grammar:
+    # This allows the grammar to also return messages
+    mixed_mode: true
+    # Prefix to add to the grammar
+    #prefix: '<tool_call>\n'
+    # Force parallel calls in the grammar
+    # parallel_calls: true
+
+  return_name_in_function_response: true
+  # Without grammar uncomment the lines below
+  # Warning: this is relying only on the capability of the
+  # LLM model to generate the correct function call.
+  json_regex_match: 
+   - "(?s)<tool_call>(.*?)</tool_call>"
+   - "(?s)<tool_call>(.*?)"
+  replace_llm_results:
+  # Drop the scratchpad content from responses
+  - key: "(?s)<scratchpad>.*</scratchpad>"
+    value: ""
+  replace_function_results: 
+  # Replace everything that is not JSON array or object
+  # 
+  - key: '(?s)^[^{\[]*'
+    value: ""
+  - key: '(?s)[^}\]]*$'
+    value: ""
+  - key: "'([^']*?)'"
+    value: "_DQUOTE_${1}_DQUOTE_"
+  - key: '\\"'
+    value: "__TEMP_QUOTE__"
+  - key: "\'"
+    value: "'"
+  - key: "_DQUOTE_"
+    value: '"'
+  - key: "__TEMP_QUOTE__"
+    value: '"'
+  # Drop the scratchpad content from responses
+  - key: "(?s)<scratchpad>.*</scratchpad>"
+    value: ""
+
 template:
  chat: |
-    <|begin_of_text|><|start_header_id|>system<|end_header_id|>
-    You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>
-    {{.Input }}
-    <|start_header_id|>assistant<|end_header_id|>
+    {{.Input -}}
+    <|im_start|>assistant
  chat_message: |
-    <|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|>
-    {{ if .FunctionCall -}}
-    {{ else if eq .RoleName "tool" -}}
-    The Function was executed and the response was:
-    {{ end -}}
-    {{ if .Content -}}
-    {{.Content -}}
-    {{ else if .FunctionCall -}}
-    {{ range .FunctionCall }}
-    [{{.FunctionCall.Name}}({{.FunctionCall.Arguments}})]
-    {{ end }}
-    {{ end -}}
-    <|eot_id|>
+    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
+    {{- if .FunctionCall }}
+    <tool_call>
+    {{- else if eq .RoleName "tool" }}
+    <tool_response>
+    {{- end }}
+    {{- if .Content}}
+    {{.Content }}
+    {{- end }}
+    {{- if .FunctionCall}}
+    {{toJson .FunctionCall}}
+    {{- end }}
+    {{- if .FunctionCall }}
+    </tool_call>
+    {{- else if eq .RoleName "tool" }}
+    </tool_response>
+    {{- end }}<|im_end|>
  completion: |
    {{.Input}}
-  function: |
-    <|start_header_id|>system<|end_header_id|>
-    You are an expert in composing functions. You are given a question and a set of possible functions.
-    Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
-    If none of the functions can be used, point it out. If the given question lacks the parameters required by the function, also point it out. You should only return the function call in tools call sections.
-    If you decide to invoke any of the function(s), you MUST put it in the format as follows:
-    [func_name1(params_name1=params_value1,params_name2=params_value2,...),func_name2(params_name1=params_value1,params_name2=params_value2,...)]
-    You SHOULD NOT include any other text in the response.
-    Here is a list of functions in JSON format that you can invoke.
-    {{toJson .Functions}}
-    <|eot_id|><|start_header_id|>user<|end_header_id|>
-    {{.Input}}
-    <|eot_id|><|start_header_id|>assistant<|end_header_id|>
-
-download_files:
- filename: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
-  sha256: 2e220a14ba4328fee38cf36c2c068261560f999fadb5725ce5c6d977cb5126b5
-  uri: huggingface://bartowski/Hermes-3-Llama-3.2-3B-GGUF/Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
+  function: |-
+    <|im_start|>system
+    You are a function calling AI model.
+    Here are the available tools:
+    <tools>
+    {{range .Functions}}
+    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
+    {{end}}
+    </tools>
+    You should call the tools provided to you sequentially
+    Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
+    <scratchpad>
+    {step-by-step reasoning and plan in bullet points}
+    </scratchpad>
+    For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
+    <tool_call>
+    {"arguments": <args-dict>, "name": <function-name>}
+    </tool_call><|im_end|>
+    {{.Input -}}
+    <|im_start|>assistant
--- a/aio/cpu/vision.yaml
+++ b/aio/cpu/vision.yaml
@@ -1,49 +1,31 @@
+backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
-mmproj: minicpm-v-2_6-mmproj-f16.gguf
 name: gpt-4o
+
+roles:
+  user: "USER:"
+  assistant: "ASSISTANT:"
+  system: "SYSTEM:"
+
+mmproj: bakllava-mmproj.gguf
 parameters:
-  model: minicpm-v-2_6-Q4_K_M.gguf
-stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|endoftext|>
+  model: bakllava.gguf
+
 template:
  chat: |
-    {{.Input -}}
-    <|im_start|>assistant
-  chat_message: |
-    <|im_start|>{{ .RoleName }}
-    {{ if .FunctionCall -}}
-    Function call:
-    {{ else if eq .RoleName "tool" -}}
-    Function response:
-    {{ end -}}
-    {{ if .Content -}}
-    {{.Content }}
-    {{ end -}}
-    {{ if .FunctionCall -}}
-    {{toJson .FunctionCall}}
-    {{ end -}}<|im_end|>
-  completion: |
+    A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
    {{.Input}}
-  function: |
-    <|im_start|>system
-    You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
-    {{range .Functions}}
-    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
-    {{end}}
-    For each function call return a json object with function name and arguments
-    <|im_end|>
-    {{.Input -}}
-    <|im_start|>assistant
+    ASSISTANT:

 download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf
-  sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
-  sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
+- filename: bakllava.gguf
+  uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
+- filename: bakllava-mmproj.gguf
+  uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
+
+usage: |
+    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+        "model": "gpt-4o",
+        "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
--- a/aio/gpu-8g/embeddings.yaml
+++ b/aio/gpu-8g/embeddings.yaml
@@ -1,7 +1,7 @@
-embeddings: true
 name: text-embedding-ada-002
+backend: sentencetransformers
 parameters:
-  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
+  model: all-MiniLM-L6-v2

 usage: |
    You can test this model with curl like this:
--- a/aio/gpu-8g/text-to-text.yaml
+++ b/aio/gpu-8g/text-to-text.yaml
@@ -1,53 +1,101 @@
-context_size: 4096
-f16: true
-function:
-  capture_llm_results:
-  - (?s)<Thought>(.*?)</Thought>
-  grammar:
-    properties_order: name,arguments
-  json_regex_match:
-  - (?s)<Output>(.*?)</Output>
-  replace_llm_results:
-  - key: (?s)<Thought>(.*?)</Thought>
-    value: ""
-mmap: true
 name: gpt-4
+mmap: true
 parameters:
-  model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
+  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
+context_size: 8192
+
 stopwords:
- <|im_end|>
- <dummy32000>
- </s>
+- "<|im_end|>"
+- "<dummy32000>"
+- "</tool_call>"
+- "<|eot_id|>"
+- "<|end_of_text|>"
+
+function:
+  # disable injecting the "answer" tool
+  disable_no_action: true
+
+  grammar:
+    # This allows the grammar to also return messages
+    mixed_mode: true
+    # Prefix to add to the grammar
+    #prefix: '<tool_call>\n'
+    # Force parallel calls in the grammar
+    # parallel_calls: true
+
+  return_name_in_function_response: true
+  # Without grammar uncomment the lines below
+  # Warning: this is relying only on the capability of the
+  # LLM model to generate the correct function call.
+  json_regex_match: 
+   - "(?s)<tool_call>(.*?)</tool_call>"
+   - "(?s)<tool_call>(.*?)"
+  replace_llm_results:
+  # Drop the scratchpad content from responses
+  - key: "(?s)<scratchpad>.*</scratchpad>"
+    value: ""
+  replace_function_results: 
+  # Replace everything that is not JSON array or object
+  # 
+  - key: '(?s)^[^{\[]*'
+    value: ""
+  - key: '(?s)[^}\]]*$'
+    value: ""
+  - key: "'([^']*?)'"
+    value: "_DQUOTE_${1}_DQUOTE_"
+  - key: '\\"'
+    value: "__TEMP_QUOTE__"
+  - key: "\'"
+    value: "'"
+  - key: "_DQUOTE_"
+    value: '"'
+  - key: "__TEMP_QUOTE__"
+    value: '"'
+  # Drop the scratchpad content from responses
+  - key: "(?s)<scratchpad>.*</scratchpad>"
+    value: ""
+
 template:
  chat: |
    {{.Input -}}
    <|im_start|>assistant
  chat_message: |
-    <|im_start|>{{ .RoleName }}
-    {{ if .FunctionCall -}}
-    Function call:
-    {{ else if eq .RoleName "tool" -}}
-    Function response:
-    {{ end -}}
-    {{ if .Content -}}
+    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
+    {{- if .FunctionCall }}
+    <tool_call>
+    {{- else if eq .RoleName "tool" }}
+    <tool_response>
+    {{- end }}
+    {{- if .Content}}
    {{.Content }}
-    {{ end -}}
-    {{ if .FunctionCall -}}
+    {{- end }}
+    {{- if .FunctionCall}}
    {{toJson .FunctionCall}}
-    {{ end -}}<|im_end|>
+    {{- end }}
+    {{- if .FunctionCall }}
+    </tool_call>
+    {{- else if eq .RoleName "tool" }}
+    </tool_response>
+    {{- end }}<|im_end|>
  completion: |
    {{.Input}}
-  function: |
+  function: |-
    <|im_start|>system
-    You are an AI assistant that executes function calls, and these are the tools at your disposal:
+    You are a function calling AI model.
+    Here are the available tools:
+    <tools>
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
-    <|im_end|>
+    </tools>
+    You should call the tools provided to you sequentially
+    Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
+    <scratchpad>
+    {step-by-step reasoning and plan in bullet points}
+    </scratchpad>
+    For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
+    <tool_call>
+    {"arguments": <args-dict>, "name": <function-name>}
+    </tool_call><|im_end|>
    {{.Input -}}
-    <|im_start|>assistant
-
-download_files:
- filename: localai-functioncall-phi-4-v0.3-q4_k_m.gguf
-  sha256: 23fee048ded2a6e2e1a7b6bbefa6cbf83068f194caa9552aecbaa00fec8a16d5
-  uri: huggingface://mudler/LocalAI-functioncall-phi-4-v0.3-Q4_K_M-GGUF/localai-functioncall-phi-4-v0.3-q4_k_m.gguf
+    <|im_start|>assistant
--- a/aio/gpu-8g/vision.yaml
+++ b/aio/gpu-8g/vision.yaml
@@ -1,49 +1,35 @@
+backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
-mmproj: minicpm-v-2_6-mmproj-f16.gguf
 name: gpt-4o
+
+roles:
+  user: "USER:"
+  assistant: "ASSISTANT:"
+  system: "SYSTEM:"
+
+mmproj: llava-v1.6-7b-mmproj-f16.gguf
 parameters:
-  model: minicpm-v-2_6-Q4_K_M.gguf
-stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|endoftext|>
+  model: llava-v1.6-mistral-7b.Q5_K_M.gguf
+  temperature: 0.2
+  top_k: 40
+  top_p: 0.95
+  seed: -1
+
 template:
  chat: |
-    {{.Input -}}
-    <|im_start|>assistant
-  chat_message: |
-    <|im_start|>{{ .RoleName }}
-    {{ if .FunctionCall -}}
-    Function call:
-    {{ else if eq .RoleName "tool" -}}
-    Function response:
-    {{ end -}}
-    {{ if .Content -}}
-    {{.Content }}
-    {{ end -}}
-    {{ if .FunctionCall -}}
-    {{toJson .FunctionCall}}
-    {{ end -}}<|im_end|>
-  completion: |
+    A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
    {{.Input}}
-  function: |
-    <|im_start|>system
-    You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
-    {{range .Functions}}
-    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
-    {{end}}
-    For each function call return a json object with function name and arguments
-    <|im_end|>
-    {{.Input -}}
-    <|im_start|>assistant
+    ASSISTANT:

 download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf
-  sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
-  sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
+- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
+  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
+- filename: llava-v1.6-7b-mmproj-f16.gguf
+  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
+
+usage: |
+    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+        "model": "gpt-4o",
+        "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
--- a/aio/intel/embeddings.yaml
+++ b/aio/intel/embeddings.yaml
@@ -1,7 +1,7 @@
-embeddings: true
 name: text-embedding-ada-002
+backend: sentencetransformers
 parameters:
-  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
+  model: all-MiniLM-L6-v2

 usage: |
    You can test this model with curl like this:
--- a/aio/intel/text-to-text.yaml
+++ b/aio/intel/text-to-text.yaml
@@ -1,53 +1,103 @@
-context_size: 4096
-f16: true
-function:
-  capture_llm_results:
-  - (?s)<Thought>(.*?)</Thought>
-  grammar:
-    properties_order: name,arguments
-  json_regex_match:
-  - (?s)<Output>(.*?)</Output>
-  replace_llm_results:
-  - key: (?s)<Thought>(.*?)</Thought>
-    value: ""
-mmap: true
 name: gpt-4
+mmap: false
+context_size: 8192
+
+f16: false
 parameters:
-  model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
+  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
+
 stopwords:
- <|im_end|>
- <dummy32000>
- </s>
+- "<|im_end|>"
+- "<dummy32000>"
+- "</tool_call>"
+- "<|eot_id|>"
+- "<|end_of_text|>"
+
+function:
+  # disable injecting the "answer" tool
+  disable_no_action: true
+
+  grammar:
+    # This allows the grammar to also return messages
+    mixed_mode: true
+    # Prefix to add to the grammar
+    #prefix: '<tool_call>\n'
+    # Force parallel calls in the grammar
+    # parallel_calls: true
+
+  return_name_in_function_response: true
+  # Without grammar uncomment the lines below
+  # Warning: this is relying only on the capability of the
+  # LLM model to generate the correct function call.
+  json_regex_match: 
+   - "(?s)<tool_call>(.*?)</tool_call>"
+   - "(?s)<tool_call>(.*?)"
+  replace_llm_results:
+  # Drop the scratchpad content from responses
+  - key: "(?s)<scratchpad>.*</scratchpad>"
+    value: ""
+  replace_function_results: 
+  # Replace everything that is not JSON array or object
+  # 
+  - key: '(?s)^[^{\[]*'
+    value: ""
+  - key: '(?s)[^}\]]*$'
+    value: ""
+  - key: "'([^']*?)'"
+    value: "_DQUOTE_${1}_DQUOTE_"
+  - key: '\\"'
+    value: "__TEMP_QUOTE__"
+  - key: "\'"
+    value: "'"
+  - key: "_DQUOTE_"
+    value: '"'
+  - key: "__TEMP_QUOTE__"
+    value: '"'
+  # Drop the scratchpad content from responses
+  - key: "(?s)<scratchpad>.*</scratchpad>"
+    value: ""
+
 template:
  chat: |
    {{.Input -}}
    <|im_start|>assistant
  chat_message: |
-    <|im_start|>{{ .RoleName }}
-    {{ if .FunctionCall -}}
-    Function call:
-    {{ else if eq .RoleName "tool" -}}
-    Function response:
-    {{ end -}}
-    {{ if .Content -}}
+    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
+    {{- if .FunctionCall }}
+    <tool_call>
+    {{- else if eq .RoleName "tool" }}
+    <tool_response>
+    {{- end }}
+    {{- if .Content}}
    {{.Content }}
-    {{ end -}}
-    {{ if .FunctionCall -}}
+    {{- end }}
+    {{- if .FunctionCall}}
    {{toJson .FunctionCall}}
-    {{ end -}}<|im_end|>
+    {{- end }}
+    {{- if .FunctionCall }}
+    </tool_call>
+    {{- else if eq .RoleName "tool" }}
+    </tool_response>
+    {{- end }}<|im_end|>
  completion: |
    {{.Input}}
-  function: |
+  function: |-
    <|im_start|>system
-    You are an AI assistant that executes function calls, and these are the tools at your disposal:
+    You are a function calling AI model.
+    Here are the available tools:
+    <tools>
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
-    <|im_end|>
+    </tools>
+    You should call the tools provided to you sequentially
+    Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
+    <scratchpad>
+    {step-by-step reasoning and plan in bullet points}
+    </scratchpad>
+    For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
+    <tool_call>
+    {"arguments": <args-dict>, "name": <function-name>}
+    </tool_call><|im_end|>
    {{.Input -}}
    <|im_start|>assistant
-
-download_files:
- filename: localai-functioncall-phi-4-v0.3-q4_k_m.gguf
-  sha256: 23fee048ded2a6e2e1a7b6bbefa6cbf83068f194caa9552aecbaa00fec8a16d5
-  uri: huggingface://mudler/LocalAI-functioncall-phi-4-v0.3-Q4_K_M-GGUF/localai-functioncall-phi-4-v0.3-q4_k_m.gguf
--- a/aio/intel/vision.yaml
+++ b/aio/intel/vision.yaml
@@ -1,50 +1,35 @@
+backend: llama-cpp
 context_size: 4096
-f16: true
-mmap: true
-mmproj: minicpm-v-2_6-mmproj-f16.gguf
+mmap: false
+f16: false
 name: gpt-4o
+
+roles:
+  user: "USER:"
+  assistant: "ASSISTANT:"
+  system: "SYSTEM:"
+
+mmproj: llava-v1.6-7b-mmproj-f16.gguf
 parameters:
-  model: minicpm-v-2_6-Q4_K_M.gguf
-stopwords:
-- <|im_end|>
-- <dummy32000>
-- </s>
-- <|endoftext|>
+  model: llava-v1.6-mistral-7b.Q5_K_M.gguf
+  temperature: 0.2
+  top_k: 40
+  top_p: 0.95
+  seed: -1
+
 template:
  chat: |
-    {{.Input -}}
-    <|im_start|>assistant
-  chat_message: |
-    <|im_start|>{{ .RoleName }}
-    {{ if .FunctionCall -}}
-    Function call:
-    {{ else if eq .RoleName "tool" -}}
-    Function response:
-    {{ end -}}
-    {{ if .Content -}}
-    {{.Content }}
-    {{ end -}}
-    {{ if .FunctionCall -}}
-    {{toJson .FunctionCall}}
-    {{ end -}}<|im_end|>
-  completion: |
+    A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
    {{.Input}}
-  function: |
-    <|im_start|>system
-    You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
-    {{range .Functions}}
-    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
-    {{end}}
-    For each function call return a json object with function name and arguments
-    <|im_end|>
-    {{.Input -}}
-    <|im_start|>assistant
-
+    ASSISTANT:

 download_files:
-- filename: minicpm-v-2_6-Q4_K_M.gguf
-  sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
-- filename: minicpm-v-2_6-mmproj-f16.gguf
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
-  sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
+- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
+  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
+- filename: llava-v1.6-7b-mmproj-f16.gguf
+  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
+
+usage: |
+    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+        "model": "gpt-4-vision-preview",
+        "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
--- a/backend/cpp/llama/CMakeLists.txt
+++ b/backend/cpp/llama/CMakeLists.txt
@@ -2,7 +2,7 @@
 ## XXX: In some versions of CMake clip wasn't being built before llama.
 ## This is an hack for now, but it should be fixed in the future.
 set(TARGET myclip)
-add_library(${TARGET} clip.cpp clip.h clip-impl.h llava.cpp llava.h)
+add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
 install(TARGETS ${TARGET} LIBRARY)
 target_include_directories(myclip PUBLIC .)
 target_include_directories(myclip PUBLIC ../..)
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -8,7 +8,7 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 TARGET?=--target grpc-server

 # Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
-CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
+CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

 # If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)
@@ -36,18 +36,11 @@ else ifeq ($(OS),Darwin)
 endif

 ifeq ($(BUILD_TYPE),sycl_f16)
-	CMAKE_ARGS+=-DGGML_SYCL=ON \
-		-DCMAKE_C_COMPILER=icx \
-		-DCMAKE_CXX_COMPILER=icpx \
-		-DCMAKE_CXX_FLAGS="-fsycl" \
-		-DGGML_SYCL_F16=ON
+	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
 endif

 ifeq ($(BUILD_TYPE),sycl_f32)
-	CMAKE_ARGS+=-DGGML_SYCL=ON \
-		-DCMAKE_C_COMPILER=icx \
-		-DCMAKE_CXX_COMPILER=icpx \
-		-DCMAKE_CXX_FLAGS="-fsycl"
+	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 endif

 llama.cpp:
@@ -84,4 +77,4 @@ ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 else
 	+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)
 endif
-	cp llama.cpp/build/bin/grpc-server .
+	cp llama.cpp/build/bin/grpc-server .
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -467,7 +467,6 @@ struct llama_server_context
    bool all_slots_are_idle = false;
    bool add_bos_token      = true;
    bool has_eos_token      = true;
-    bool has_gpu = false;

    bool grammar_lazy = false;
    std::vector<common_grammar_trigger> grammar_triggers;
@@ -509,15 +508,12 @@ struct llama_server_context
    bool load_model(const common_params &params_)
    {
        params = params_;
-        if (!params.mmproj.path.empty()) {
+        if (!params.mmproj.empty()) {
            multimodal = true;
            LOG_INFO("Multi Modal Mode Enabled", {});
-            clp_ctx = clip_init(params.mmproj.path.c_str(), clip_context_params {
-                /* use_gpu */ has_gpu,
-                /*verbosity=*/ GGML_LOG_LEVEL_INFO,
-            });
+            clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
            if(clp_ctx == nullptr) {
-                LOG_ERR("unable to load clip model: %s", params.mmproj.path.c_str());
+                LOG_ERR("unable to load clip model: %s", params.mmproj.c_str());
                return false;
            }

@@ -531,7 +527,7 @@ struct llama_server_context
        ctx = common_init.context.release();
        if (model == nullptr)
        {
-            LOG_ERR("unable to load model: %s", params.model.path.c_str());
+            LOG_ERR("unable to load model: %s", params.model.c_str());
            return false;
        }

@@ -2122,11 +2118,7 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
 }

 std::function<void(int)> shutdown_handler;
-
-inline void signal_handler(int signal) {
-    exit(1);
-}
-
+inline void signal_handler(int signal) { shutdown_handler(signal); }

 /////////////////////////////////
 ////////////////////////////////
@@ -2322,15 +2314,15 @@ static std::string get_all_kv_cache_types() {
 }

 static void params_parse(const backend::ModelOptions* request,
-                                common_params & params, llama_server_context &llama) {
+                                common_params & params) {
   
    // this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809

-    params.model.path = request->modelfile();
+    params.model = request->modelfile();
    if (!request->mmproj().empty()) {
    // get the directory of modelfile
-      std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\"));
-      params.mmproj.path = model_dir + "/"+ request->mmproj();
+      std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
+      params.mmproj = model_dir + "/"+ request->mmproj();
    }
    //  params.model_alias ??
    params.model_alias =  request->modelfile();
@@ -2360,20 +2352,6 @@ static void params_parse(const backend::ModelOptions* request,
        add_rpc_devices(std::string(llama_grpc_servers));
    }
    
-     // decode options. Options are in form optname:optvale, or if booleans only optname.
-    for (int i = 0; i < request->options_size(); i++) {
-        std::string opt = request->options(i);
-        char *optname = strtok(&opt[0], ":");
-        char *optval = strtok(NULL, ":");
-        if (optval == NULL) {
-            optval = "true";
-        }
-
-        if (!strcmp(optname, "gpu")) {
-            llama.has_gpu = true;
-        }
-    }
-
    // TODO: Add yarn

    if (!request->tensorsplit().empty()) {
@@ -2405,7 +2383,7 @@ static void params_parse(const backend::ModelOptions* request,
        scale_factor = request->lorascale();
     }
     // get the directory of modelfile
-     std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\"));
+     std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
     params.lora_adapters.push_back({ model_dir + "/"+request->loraadapter(), scale_factor });
    }
    params.use_mlock = request->mlock();
@@ -2467,7 +2445,7 @@ public:
  grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
    // Implement LoadModel RPC
    common_params params;
-    params_parse(request, params, llama);
+    params_parse(request, params);

    llama_backend_init();
    llama_numa_init(params.numa);
@@ -2653,20 +2631,6 @@ void RunServer(const std::string& server_address) {
 int main(int argc, char** argv) {
  std::string server_address("localhost:50051");

-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-    struct sigaction sigint_action;
-    sigint_action.sa_handler = signal_handler;
-    sigemptyset (&sigint_action.sa_mask);
-    sigint_action.sa_flags = 0;
-    sigaction(SIGINT, &sigint_action, NULL);
-    sigaction(SIGTERM, &sigint_action, NULL);
-#elif defined (_WIN32)
-    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
-        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
-    };
-    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
-#endif
-
  // Define long and short options
  struct option long_options[] = {
      {"addr", required_argument, nullptr, 'a'},
--- a/backend/cpp/llama/patches/01-llava.patch
+++ b/backend/cpp/llama/patches/01-llava.patch
@@ -1,13 +1,13 @@
 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index 3cd0d2fa..6c5e811a 100644
+index 7f892beb..0517e529 100644
 --- a/examples/llava/clip.cpp
 +++ b/examples/llava/clip.cpp
-@@ -2608,7 +2608,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
-                 struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
+@@ -2766,7 +2766,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
+                 int patch_offset = ctx->has_class_embedding ? 1 : 0;
                 int* patches_data = (int*)malloc(ggml_nbytes(patches));
                 for (int i = 0; i < num_patches; i++) {
--                    patches_data[i] = i + 1;
-+                    patches_data[i] = i;
+-                    patches_data[i] = i + patch_offset;
++                    patches_data[i] = i + 1;
                 }
                 ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
                 free(patches_data);
--- a/backend/cpp/llama/prepare.sh
+++ b/backend/cpp/llama/prepare.sh
@@ -1,5 +1,7 @@
 #!/bin/bash

+set -e
+
 ## Patches
 ## Apply patches from the `patches` directory
 for patch in $(ls patches); do
@@ -21,7 +23,6 @@ fi
 ## XXX: In some versions of CMake clip wasn't being built before llama.
 ## This is an hack for now, but it should be fixed in the future.
 cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
-cp -rfv llama.cpp/examples/llava/clip-impl.h llama.cpp/examples/grpc-server/clip-impl.h
 cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
 echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
 cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
--- a/backend/go/image/stablediffusion-ggml/Makefile
+++ b/backend/go/image/stablediffusion-ggml/Makefile
@@ -8,13 +8,6 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 # keep standard at C11 and C++11
 CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC

-GOCMD?=go
-CGO_LDFLAGS?=
-# Avoid parent make file overwriting CGO_LDFLAGS which is needed for hipblas
-CGO_LDFLAGS_SYCL=
-GO_TAGS?=
-LD_FLAGS?=
-
 # Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

@@ -28,7 +21,7 @@ else ifeq ($(BUILD_TYPE),openblas)
 # If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
-# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
+# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ 
 else ifeq ($(BUILD_TYPE),hipblas)
 	CMAKE_ARGS+=-DGGML_HIP=ON
 # If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
@@ -43,35 +36,16 @@ else ifeq ($(OS),Darwin)
 	endif
 endif

-ifeq ($(BUILD_TYPE),sycl_f16)
-	CMAKE_ARGS+=-DGGML_SYCL=ON \
-		-DCMAKE_C_COMPILER=icx \
-		-DCMAKE_CXX_COMPILER=icpx \
-		-DSD_SYCL=ON \
-		-DGGML_SYCL_F16=ON
-	CC=icx
-	CXX=icpx
-	CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
-	CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
-	CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
-	CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
-endif
+# ifeq ($(BUILD_TYPE),sycl_f16)
+# 	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DSD_SYCL=ON -DGGML_SYCL_F16=ON
+# endif

-ifeq ($(BUILD_TYPE),sycl_f32)
-	CMAKE_ARGS+=-DGGML_SYCL=ON \
-		-DCMAKE_C_COMPILER=icx \
-		-DCMAKE_CXX_COMPILER=icpx \
-		-DSD_SYCL=ON
-	CC=icx
-	CXX=icpx
-	CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
-	CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
-	CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
-	CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
-endif
+# ifeq ($(BUILD_TYPE),sycl_f32)
+# 	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON
+# endif

 # warnings
-# CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
+CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function

 # Find all .a archives in ARCHIVE_DIR
 # (ggml can have different backends cpu, cuda, etc., each backend generates a .a archive)
@@ -112,24 +86,11 @@ endif
 	$(MAKE) $(COMBINED_LIB)

 gosd.o:
-ifneq (,$(findstring sycl,$(BUILD_TYPE)))
-	+bash -c "source $(ONEAPI_VARS); \
-	$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c"
-else
 	$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c
-endif

 libsd.a: gosd.o
 	cp $(INCLUDE_PATH)/build/libstable-diffusion.a ./libsd.a
 	$(AR) rcs libsd.a gosd.o

-stablediffusion-ggml:
-	CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_SYCL)" C_INCLUDE_PATH="$(INCLUDE_PATH)" LIBRARY_PATH="$(LIBRARY_PATH)" \
-	CC="$(CC)" CXX="$(CXX)" CGO_CXXFLAGS="$(CGO_CXXFLAGS)" \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o ../../../../backend-assets/grpc/stablediffusion-ggml ./
-ifneq ($(UPX),)
-	$(UPX) ../../../../backend-assets/grpc/stablediffusion-ggml
-endif
-
 clean:
-	rm -rf gosd.o libsd.a build $(COMBINED_LIB)
+	rm -rf gosd.o libsd.a build $(COMBINED_LIB)
--- a/backend/python/diffusers/backend.py
+++ b/backend/python/diffusers/backend.py
@@ -19,7 +19,7 @@ import grpc

 from diffusers import SanaPipeline, StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, \
    EulerAncestralDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
-from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline, Lumina2Text2ImgPipeline
+from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline
 from diffusers.pipelines.stable_diffusion import safety_checker
 from diffusers.utils import load_image, export_to_video
 from compel import Compel, ReturnedEmbeddingsType
@@ -287,12 +287,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

                    if request.LowVRAM:
                        self.pipe.enable_model_cpu_offload()
-            elif request.PipelineType == "Lumina2Text2ImgPipeline":
-                self.pipe = Lumina2Text2ImgPipeline.from_pretrained(
-                    request.Model,
-                    torch_dtype=torch.bfloat16)
-                if request.LowVRAM:
-                    self.pipe.enable_model_cpu_offload()
            elif request.PipelineType == "SanaPipeline":
                self.pipe = SanaPipeline.from_pretrained(
                    request.Model,
--- a/core/application/application.go
+++ b/core/application/application.go
@@ -16,7 +16,7 @@ type Application struct {
 func newApplication(appConfig *config.ApplicationConfig) *Application {
 	return &Application{
 		backendLoader:      config.NewBackendConfigLoader(appConfig.ModelPath),
-		modelLoader:        model.NewModelLoader(appConfig.ModelPath, appConfig.SingleBackend),
+		modelLoader:        model.NewModelLoader(appConfig.ModelPath),
 		applicationConfig:  appConfig,
 		templatesEvaluator: templates.NewEvaluator(appConfig.ModelPath),
 	}
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -143,7 +143,7 @@ func New(opts ...config.AppOption) (*Application, error) {
 		}()
 	}

-	if options.LoadToMemory != nil && !options.SingleBackend {
+	if options.LoadToMemory != nil {
 		for _, m := range options.LoadToMemory {
 			cfg, err := application.BackendLoader().LoadBackendConfigFileByNameDefaultOptions(m, options)
 			if err != nil {
--- a/core/backend/embeddings.go
+++ b/core/backend/embeddings.go
@@ -17,7 +17,6 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendCo
 	if err != nil {
 		return nil, err
 	}
-	defer loader.Close()

 	var fn func() ([]float32, error)
 	switch model := inferenceModel.(type) {
--- a/core/backend/image.go
+++ b/core/backend/image.go
@@ -16,7 +16,6 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
 	if err != nil {
 		return nil, err
 	}
-	defer loader.Close()

 	fn := func() error {
 		_, err := inferenceModel.GenerateImage(
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -53,7 +53,6 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 	if err != nil {
 		return nil, err
 	}
-	defer loader.Close()

 	var protoMessages []*proto.Message
 	// if we are using the tokenizer template, we need to convert the messages to proto messages
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -40,6 +40,10 @@ func ModelOptions(c config.BackendConfig, so *config.ApplicationConfig, opts ...
 	grpcOpts := grpcModelOpts(c)
 	defOpts = append(defOpts, model.WithLoadGRPCLoadModelOpts(grpcOpts))

+	if so.SingleBackend {
+		defOpts = append(defOpts, model.WithSingleActiveBackend())
+	}
+
 	if so.ParallelBackendRequests {
 		defOpts = append(defOpts, model.EnableParallelRequests)
 	}
@@ -117,7 +121,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 	triggers := make([]*pb.GrammarTrigger, 0)
 	for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers {
 		triggers = append(triggers, &pb.GrammarTrigger{
-			Word: t.Word,
+			Word:    t.Word,
 		})

 	}
@@ -157,33 +161,33 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		DisableLogStatus:     c.DisableLogStatus,
 		DType:                c.DType,
 		// LimitMMPerPrompt vLLM
-		LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
-		LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
-		LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
-		MMProj:              c.MMProj,
-		FlashAttention:      c.FlashAttention,
-		CacheTypeKey:        c.CacheTypeK,
-		CacheTypeValue:      c.CacheTypeV,
-		NoKVOffload:         c.NoKVOffloading,
-		YarnExtFactor:       c.YarnExtFactor,
-		YarnAttnFactor:      c.YarnAttnFactor,
-		YarnBetaFast:        c.YarnBetaFast,
-		YarnBetaSlow:        c.YarnBetaSlow,
-		NGQA:                c.NGQA,
-		RMSNormEps:          c.RMSNormEps,
-		MLock:               mmlock,
-		RopeFreqBase:        c.RopeFreqBase,
-		RopeScaling:         c.RopeScaling,
-		Type:                c.ModelType,
-		RopeFreqScale:       c.RopeFreqScale,
-		NUMA:                c.NUMA,
-		Embeddings:          embeddings,
-		LowVRAM:             lowVRAM,
-		NGPULayers:          int32(nGPULayers),
-		MMap:                mmap,
-		MainGPU:             c.MainGPU,
-		Threads:             int32(*c.Threads),
-		TensorSplit:         c.TensorSplit,
+		LimitImagePerPrompt:  int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
+		LimitVideoPerPrompt:  int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
+		LimitAudioPerPrompt:  int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
+		MMProj:               c.MMProj,
+		FlashAttention:       c.FlashAttention,
+		CacheTypeKey:         c.CacheTypeK,
+		CacheTypeValue:       c.CacheTypeV,
+		NoKVOffload:          c.NoKVOffloading,
+		YarnExtFactor:        c.YarnExtFactor,
+		YarnAttnFactor:       c.YarnAttnFactor,
+		YarnBetaFast:         c.YarnBetaFast,
+		YarnBetaSlow:         c.YarnBetaSlow,
+		NGQA:                 c.NGQA,
+		RMSNormEps:           c.RMSNormEps,
+		MLock:                mmlock,
+		RopeFreqBase:         c.RopeFreqBase,
+		RopeScaling:          c.RopeScaling,
+		Type:                 c.ModelType,
+		RopeFreqScale:        c.RopeFreqScale,
+		NUMA:                 c.NUMA,
+		Embeddings:           embeddings,
+		LowVRAM:              lowVRAM,
+		NGPULayers:           int32(nGPULayers),
+		MMap:                 mmap,
+		MainGPU:              c.MainGPU,
+		Threads:              int32(*c.Threads),
+		TensorSplit:          c.TensorSplit,
 		// AutoGPTQ
 		ModelBaseName:    c.AutoGPTQ.ModelBaseName,
 		Device:           c.AutoGPTQ.Device,
--- a/core/backend/rerank.go
+++ b/core/backend/rerank.go
@@ -12,10 +12,10 @@ import (
 func Rerank(request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
 	opts := ModelOptions(backendConfig, appConfig)
 	rerankModel, err := loader.Load(opts...)
+
 	if err != nil {
 		return nil, err
 	}
-	defer loader.Close()

 	if rerankModel == nil {
 		return nil, fmt.Errorf("could not load rerank model")
--- a/core/backend/soundgeneration.go
+++ b/core/backend/soundgeneration.go
@@ -26,10 +26,10 @@ func SoundGeneration(

 	opts := ModelOptions(backendConfig, appConfig)
 	soundGenModel, err := loader.Load(opts...)
+
 	if err != nil {
 		return "", nil, err
 	}
-	defer loader.Close()

 	if soundGenModel == nil {
 		return "", nil, fmt.Errorf("could not load sound generation model")
--- a/core/backend/token_metrics.go
+++ b/core/backend/token_metrics.go
@@ -20,7 +20,6 @@ func TokenMetrics(
 	if err != nil {
 		return nil, err
 	}
-	defer loader.Close()

 	if model == nil {
 		return nil, fmt.Errorf("could not loadmodel model")
--- a/core/backend/tokenize.go
+++ b/core/backend/tokenize.go
@@ -14,10 +14,10 @@ func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.Bac

 	opts := ModelOptions(backendConfig, appConfig)
 	inferenceModel, err = loader.Load(opts...)
+
 	if err != nil {
 		return schema.TokenizeResponse{}, err
 	}
-	defer loader.Close()

 	predictOptions := gRPCPredictOpts(backendConfig, loader.ModelPath)
 	predictOptions.Prompt = s
--- a/core/backend/transcript.go
+++ b/core/backend/transcript.go
@@ -24,7 +24,6 @@ func ModelTranscription(audio, language string, translate bool, ml *model.ModelL
 	if err != nil {
 		return nil, err
 	}
-	defer ml.Close()

 	if transcriptionModel == nil {
 		return nil, fmt.Errorf("could not load transcription model")
--- a/core/backend/tts.go
+++ b/core/backend/tts.go
@@ -23,10 +23,10 @@ func ModelTTS(
 ) (string, *proto.Result, error) {
 	opts := ModelOptions(backendConfig, appConfig, model.WithDefaultBackendString(model.PiperBackend))
 	ttsModel, err := loader.Load(opts...)
+
 	if err != nil {
 		return "", nil, err
 	}
-	defer loader.Close()

 	if ttsModel == nil {
 		return "", nil, fmt.Errorf("could not load tts model %q", backendConfig.Model)
--- a/core/backend/vad.go
+++ b/core/backend/vad.go
@@ -19,8 +19,6 @@ func VAD(request *schema.VADRequest,
 	if err != nil {
 		return nil, err
 	}
-	defer ml.Close()
-
 	req := proto.VADRequest{
 		Audio: request.Audio,
 	}
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -38,7 +38,7 @@ type RunCMD struct {

 	F16         bool `name:"f16" env:"LOCALAI_F16,F16" help:"Enable GPU acceleration" group:"performance"`
 	Threads     int  `env:"LOCALAI_THREADS,THREADS" short:"t" help:"Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested" group:"performance"`
-	ContextSize int  `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" help:"Default context size for models" group:"performance"`
+	ContextSize int  `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" default:"512" help:"Default context size for models" group:"performance"`

 	Address                            string   `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
 	CORS                               bool     `env:"LOCALAI_CORS,CORS" help:"" group:"api"`
--- a/core/cli/soundgeneration.go
+++ b/core/cli/soundgeneration.go
@@ -74,7 +74,7 @@ func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {
 		AssetsDestination:    t.BackendAssetsPath,
 		ExternalGRPCBackends: externalBackends,
 	}
-	ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)
+	ml := model.NewModelLoader(opts.ModelPath)

 	defer func() {
 		err := ml.StopAllGRPC()
--- a/core/cli/transcript.go
+++ b/core/cli/transcript.go
@@ -32,7 +32,7 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
 	}

 	cl := config.NewBackendConfigLoader(t.ModelsPath)
-	ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)
+	ml := model.NewModelLoader(opts.ModelPath)
 	if err := cl.LoadBackendConfigsFromPath(t.ModelsPath); err != nil {
 		return err
 	}
--- a/core/cli/tts.go
+++ b/core/cli/tts.go
@@ -41,7 +41,7 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
 		AudioDir:          outputDir,
 		AssetsDestination: t.BackendAssetsPath,
 	}
-	ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)
+	ml := model.NewModelLoader(opts.ModelPath)

 	defer func() {
 		err := ml.StopAllGRPC()
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -389,6 +389,16 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
 		cfg.Embeddings = &falseV
 	}

+	// Value passed by the top level are treated as default (no implicit defaults)
+	// defaults are set by the user
+	if ctx == 0 {
+		ctx = 1024
+	}
+
+	if cfg.ContextSize == nil {
+		cfg.ContextSize = &ctx
+	}
+
 	if threads == 0 {
 		// Threads can't be 0
 		threads = 4
@@ -410,7 +420,7 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
 		cfg.Debug = &trueV
 	}

-	guessDefaultsFromFile(cfg, lo.modelPath, ctx)
+	guessDefaultsFromFile(cfg, lo.modelPath)
 }

 func (c *BackendConfig) Validate() bool {
--- a/core/config/gguf.go
+++ b/core/config/gguf.go
@@ -1,253 +0,0 @@
-package config
-
-import (
-	"strings"
-
-	"github.com/rs/zerolog/log"
-
-	gguf "github.com/thxcode/gguf-parser-go"
-)
-
-type familyType uint8
-
-const (
-	Unknown familyType = iota
-	LLaMa3
-	CommandR
-	Phi3
-	ChatML
-	Mistral03
-	Gemma
-	DeepSeek2
-)
-
-const (
-	defaultContextSize = 1024
-)
-
-type settingsConfig struct {
-	StopWords      []string
-	TemplateConfig TemplateConfig
-	RepeatPenalty  float64
-}
-
-// default settings to adopt with a given model family
-var defaultsSettings map[familyType]settingsConfig = map[familyType]settingsConfig{
-	Gemma: {
-		RepeatPenalty: 1.0,
-		StopWords:     []string{"<|im_end|>", "<end_of_turn>", "<start_of_turn>"},
-		TemplateConfig: TemplateConfig{
-			Chat:        "{{.Input }}\n<start_of_turn>model\n",
-			ChatMessage: "<start_of_turn>{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}<end_of_turn>",
-			Completion:  "{{.Input}}",
-		},
-	},
-	DeepSeek2: {
-		StopWords: []string{"<｜end▁of▁sentence｜>"},
-		TemplateConfig: TemplateConfig{
-			ChatMessage: `{{if eq .RoleName "user" -}}User: {{.Content }}
-{{ end -}}
-{{if eq .RoleName "assistant" -}}Assistant: {{.Content}}<｜end▁of▁sentence｜>{{end}}
-{{if eq .RoleName "system" -}}{{.Content}}
-{{end -}}`,
-			Chat: "{{.Input -}}\nAssistant: ",
-		},
-	},
-	LLaMa3: {
-		StopWords: []string{"<|eot_id|>"},
-		TemplateConfig: TemplateConfig{
-			Chat:        "<|begin_of_text|>{{.Input }}\n<|start_header_id|>assistant<|end_header_id|>",
-			ChatMessage: "<|start_header_id|>{{ .RoleName }}<|end_header_id|>\n\n{{.Content }}<|eot_id|>",
-		},
-	},
-	CommandR: {
-		TemplateConfig: TemplateConfig{
-			Chat: "{{.Input -}}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
-			Functions: `<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
-You are a function calling AI model, you can call the following functions:
-## Available Tools
-{{range .Functions}}
- {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
-{{end}}
-When using a tool, reply with JSON, for instance {"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}}
-<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Input -}}`,
-			ChatMessage: `{{if eq .RoleName "user" -}}
-<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
-{{- else if eq .RoleName "system" -}}
-<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
-{{- else if eq .RoleName "assistant" -}}
-<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
-{{- else if eq .RoleName "tool" -}}
-<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
-{{- else if .FunctionCall -}}
-<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{toJson .FunctionCall}}}<|END_OF_TURN_TOKEN|>
-{{- end -}}`,
-		},
-		StopWords: []string{"<|END_OF_TURN_TOKEN|>"},
-	},
-	Phi3: {
-		TemplateConfig: TemplateConfig{
-			Chat:        "{{.Input}}\n<|assistant|>",
-			ChatMessage: "<|{{ .RoleName }}|>\n{{.Content}}<|end|>",
-			Completion:  "{{.Input}}",
-		},
-		StopWords: []string{"<|end|>", "<|endoftext|>"},
-	},
-	ChatML: {
-		TemplateConfig: TemplateConfig{
-			Chat: "{{.Input -}}\n<|im_start|>assistant",
-			Functions: `<|im_start|>system
-You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
-{{range .Functions}}
-{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
-{{end}}
-For each function call return a json object with function name and arguments
-<|im_end|>
-{{.Input -}}
-<|im_start|>assistant`,
-			ChatMessage: `<|im_start|>{{ .RoleName }}
-{{ if .FunctionCall -}}
-Function call:
-{{ else if eq .RoleName "tool" -}}
-Function response:
-{{ end -}}
-{{ if .Content -}}
-{{.Content }}
-{{ end -}}
-{{ if .FunctionCall -}}
-{{toJson .FunctionCall}}
-{{ end -}}<|im_end|>`,
-		},
-		StopWords: []string{"<|im_end|>", "<dummy32000>", "</s>"},
-	},
-	Mistral03: {
-		TemplateConfig: TemplateConfig{
-			Chat:      "{{.Input -}}",
-			Functions: `[AVAILABLE_TOOLS] [{{range .Functions}}{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}{{end}} ] [/AVAILABLE_TOOLS]{{.Input }}`,
-			ChatMessage: `{{if eq .RoleName "user" -}}
-[INST] {{.Content }} [/INST]
-{{- else if .FunctionCall -}}
-[TOOL_CALLS] {{toJson .FunctionCall}} [/TOOL_CALLS]
-{{- else if eq .RoleName "tool" -}}
-[TOOL_RESULTS] {{.Content}} [/TOOL_RESULTS]
-{{- else -}}
-{{ .Content -}}
-{{ end -}}`,
-		},
-		StopWords: []string{"<|im_end|>", "<dummy32000>", "</tool_call>", "<|eot_id|>", "<|end_of_text|>", "</s>", "[/TOOL_CALLS]", "[/ACTIONS]"},
-	},
-}
-
-// this maps well known template used in HF to model families defined above
-var knownTemplates = map[string]familyType{
-	`{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}`:                              ChatML,
-	`{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}`: Mistral03,
-}
-
-func guessGGUFFromFile(cfg *BackendConfig, f *gguf.GGUFFile, defaultCtx int) {
-
-	if defaultCtx == 0 && cfg.ContextSize == nil {
-		ctxSize := f.EstimateLLaMACppUsage().ContextSize
-		if ctxSize > 0 {
-			cSize := int(ctxSize)
-			cfg.ContextSize = &cSize
-		} else {
-			defaultCtx = defaultContextSize
-			cfg.ContextSize = &defaultCtx
-		}
-	}
-
-	if cfg.HasTemplate() {
-		// nothing to guess here
-		log.Debug().Any("name", cfg.Name).Msgf("guessDefaultsFromFile: %s", "template already set")
-		return
-	}
-
-	log.Debug().
-		Any("eosTokenID", f.Tokenizer().EOSTokenID).
-		Any("bosTokenID", f.Tokenizer().BOSTokenID).
-		Any("modelName", f.Model().Name).
-		Any("architecture", f.Architecture().Architecture).Msgf("Model file loaded: %s", cfg.ModelFileName())
-
-	// guess the name
-	if cfg.Name == "" {
-		cfg.Name = f.Model().Name
-	}
-
-	family := identifyFamily(f)
-
-	if family == Unknown {
-		log.Debug().Msgf("guessDefaultsFromFile: %s", "family not identified")
-		return
-	}
-
-	// identify template
-	settings, ok := defaultsSettings[family]
-	if ok {
-		cfg.TemplateConfig = settings.TemplateConfig
-		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: guessed template %+v", cfg.TemplateConfig)
-		if len(cfg.StopWords) == 0 {
-			cfg.StopWords = settings.StopWords
-		}
-		if cfg.RepeatPenalty == 0.0 {
-			cfg.RepeatPenalty = settings.RepeatPenalty
-		}
-	} else {
-		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: no template found for family")
-	}
-
-	if cfg.HasTemplate() {
-		return
-	}
-
-	// identify from well known templates first, otherwise use the raw jinja template
-	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
-	if found {
-		// try to use the jinja template
-		cfg.TemplateConfig.JinjaTemplate = true
-		cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString()
-	}
-}
-
-func identifyFamily(f *gguf.GGUFFile) familyType {
-
-	// identify from well known templates first
-	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
-	if found && chatTemplate.ValueString() != "" {
-		if family, ok := knownTemplates[chatTemplate.ValueString()]; ok {
-			return family
-		}
-	}
-
-	// otherwise try to identify from the model properties
-	arch := f.Architecture().Architecture
-	eosTokenID := f.Tokenizer().EOSTokenID
-	bosTokenID := f.Tokenizer().BOSTokenID
-
-	isYI := arch == "llama" && bosTokenID == 1 && eosTokenID == 2
-	// WTF! Mistral0.3 and isYi have same bosTokenID and eosTokenID
-
-	llama3 := arch == "llama" && eosTokenID == 128009
-	commandR := arch == "command-r" && eosTokenID == 255001
-	qwen2 := arch == "qwen2"
-	phi3 := arch == "phi-3"
-	gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Model().Name), "gemma")
-	deepseek2 := arch == "deepseek2"
-
-	switch {
-	case deepseek2:
-		return DeepSeek2
-	case gemma:
-		return Gemma
-	case llama3:
-		return LLaMa3
-	case commandR:
-		return CommandR
-	case phi3:
-		return Phi3
-	case qwen2, isYI:
-		return ChatML
-	default:
-		return Unknown
-	}
-}
--- a/core/config/guesser.go
+++ b/core/config/guesser.go
@@ -3,12 +3,147 @@ package config
 import (
 	"os"
 	"path/filepath"
+	"strings"

 	"github.com/rs/zerolog/log"
+
 	gguf "github.com/thxcode/gguf-parser-go"
 )

-func guessDefaultsFromFile(cfg *BackendConfig, modelPath string, defaultCtx int) {
+type familyType uint8
+
+const (
+	Unknown familyType = iota
+	LLaMa3
+	CommandR
+	Phi3
+	ChatML
+	Mistral03
+	Gemma
+	DeepSeek2
+)
+
+type settingsConfig struct {
+	StopWords      []string
+	TemplateConfig TemplateConfig
+	RepeatPenalty  float64
+}
+
+// defaultsSettings holds the default settings to adopt for a given model family
+var defaultsSettings map[familyType]settingsConfig = map[familyType]settingsConfig{
+	Gemma: {
+		RepeatPenalty: 1.0,
+		StopWords:     []string{"<|im_end|>", "<end_of_turn>", "<start_of_turn>"},
+		TemplateConfig: TemplateConfig{
+			Chat:        "{{.Input }}\n<start_of_turn>model\n",
+			ChatMessage: "<start_of_turn>{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}<end_of_turn>",
+			Completion:  "{{.Input}}",
+		},
+	},
+	DeepSeek2: {
+		StopWords: []string{"<｜end▁of▁sentence｜>"},
+		TemplateConfig: TemplateConfig{
+			ChatMessage: `{{if eq .RoleName "user" -}}User: {{.Content }}
+{{ end -}}
+{{if eq .RoleName "assistant" -}}Assistant: {{.Content}}<｜end▁of▁sentence｜>{{end}}
+{{if eq .RoleName "system" -}}{{.Content}}
+{{end -}}`,
+			Chat: "{{.Input -}}\nAssistant: ",
+		},
+	},
+	LLaMa3: {
+		StopWords: []string{"<|eot_id|>"},
+		TemplateConfig: TemplateConfig{
+			Chat:        "<|begin_of_text|>{{.Input }}\n<|start_header_id|>assistant<|end_header_id|>",
+			ChatMessage: "<|start_header_id|>{{ .RoleName }}<|end_header_id|>\n\n{{.Content }}<|eot_id|>",
+		},
+	},
+	CommandR: {
+		TemplateConfig: TemplateConfig{
+			Chat: "{{.Input -}}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
+			Functions: `<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
+You are a function calling AI model, you can call the following functions:
+## Available Tools
+{{range .Functions}}
+- {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
+{{end}}
+When using a tool, reply with JSON, for instance {"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}}
+<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Input -}}`,
+			ChatMessage: `{{if eq .RoleName "user" -}}
+<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
+{{- else if eq .RoleName "system" -}}
+<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
+{{- else if eq .RoleName "assistant" -}}
+<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
+{{- else if eq .RoleName "tool" -}}
+<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
+{{- else if .FunctionCall -}}
+<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{toJson .FunctionCall}}}<|END_OF_TURN_TOKEN|>
+{{- end -}}`,
+		},
+		StopWords: []string{"<|END_OF_TURN_TOKEN|>"},
+	},
+	Phi3: {
+		TemplateConfig: TemplateConfig{
+			Chat:        "{{.Input}}\n<|assistant|>",
+			ChatMessage: "<|{{ .RoleName }}|>\n{{.Content}}<|end|>",
+			Completion:  "{{.Input}}",
+		},
+		StopWords: []string{"<|end|>", "<|endoftext|>"},
+	},
+	ChatML: {
+		TemplateConfig: TemplateConfig{
+			Chat: "{{.Input -}}\n<|im_start|>assistant",
+			Functions: `<|im_start|>system
+You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
+{{range .Functions}}
+{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
+{{end}}
+For each function call return a json object with function name and arguments
+<|im_end|>
+{{.Input -}}
+<|im_start|>assistant`,
+			ChatMessage: `<|im_start|>{{ .RoleName }}
+{{ if .FunctionCall -}}
+Function call:
+{{ else if eq .RoleName "tool" -}}
+Function response:
+{{ end -}}
+{{ if .Content -}}
+{{.Content }}
+{{ end -}}
+{{ if .FunctionCall -}}
+{{toJson .FunctionCall}}
+{{ end -}}<|im_end|>`,
+		},
+		StopWords: []string{"<|im_end|>", "<dummy32000>", "</s>"},
+	},
+	Mistral03: {
+		TemplateConfig: TemplateConfig{
+			Chat:      "{{.Input -}}",
+			Functions: `[AVAILABLE_TOOLS] [{{range .Functions}}{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}{{end}} ] [/AVAILABLE_TOOLS]{{.Input }}`,
+			ChatMessage: `{{if eq .RoleName "user" -}}
+[INST] {{.Content }} [/INST]
+{{- else if .FunctionCall -}}
+[TOOL_CALLS] {{toJson .FunctionCall}} [/TOOL_CALLS]
+{{- else if eq .RoleName "tool" -}}
+[TOOL_RESULTS] {{.Content}} [/TOOL_RESULTS]
+{{- else -}}
+{{ .Content -}}
+{{ end -}}`,
+		},
+		StopWords: []string{"<|im_end|>", "<dummy32000>", "</tool_call>", "<|eot_id|>", "<|end_of_text|>", "</s>", "[/TOOL_CALLS]", "[/ACTIONS]"},
+	},
+}
+
+// knownTemplates maps well-known chat templates used on HF to the model families defined above
+var knownTemplates = map[string]familyType{
+	`{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}`:                              ChatML,
+	`{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}`: Mistral03,
+}
+
+func guessDefaultsFromFile(cfg *BackendConfig, modelPath string) {
+
 	if os.Getenv("LOCALAI_DISABLE_GUESSING") == "true" {
 		log.Debug().Msgf("guessDefaultsFromFile: %s", "guessing disabled with LOCALAI_DISABLE_GUESSING")
 		return
@@ -19,20 +154,106 @@ func guessDefaultsFromFile(cfg *BackendConfig, modelPath string, defaultCtx int)
 		return
 	}

-	// We try to guess only if we don't have a template defined already
-	guessPath := filepath.Join(modelPath, cfg.ModelFileName())
-
-	// try to parse the gguf file
-	f, err := gguf.ParseGGUFFile(guessPath)
-	if err == nil {
-		guessGGUFFromFile(cfg, f, defaultCtx)
+	if cfg.HasTemplate() {
+		// nothing to guess here
+		log.Debug().Any("name", cfg.Name).Msgf("guessDefaultsFromFile: %s", "template already set")
 		return
 	}

-	if cfg.ContextSize == nil {
-		if defaultCtx == 0 {
-			defaultCtx = defaultContextSize
+	// No template is configured yet, so parse the model file to guess one from its GGUF metadata
+	guessPath := filepath.Join(modelPath, cfg.ModelFileName())
+	f, err := gguf.ParseGGUFFile(guessPath)
+	if err != nil {
+		// Guessing only works for parsable GGUF files; log the actual cause (missing, unreadable, other format)
+		log.Debug().Err(err).Str("filePath", guessPath).Msg("guessDefaultsFromFile: unable to parse file as GGUF")
+		return
+	}
+
+	log.Debug().
+		Any("eosTokenID", f.Tokenizer().EOSTokenID).
+		Any("bosTokenID", f.Tokenizer().BOSTokenID).
+		Any("modelName", f.Model().Name).
+		Any("architecture", f.Architecture().Architecture).Msgf("Model file loaded: %s", cfg.ModelFileName())
+
+	// guess the name
+	if cfg.Name == "" {
+		cfg.Name = f.Model().Name
+	}
+
+	family := identifyFamily(f)
+
+	if family == Unknown {
+		log.Debug().Msgf("guessDefaultsFromFile: %s", "family not identified")
+		return
+	}
+
+	// identify template
+	settings, ok := defaultsSettings[family]
+	if ok {
+		cfg.TemplateConfig = settings.TemplateConfig
+		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: guessed template %+v", cfg.TemplateConfig)
+		if len(cfg.StopWords) == 0 {
+			cfg.StopWords = settings.StopWords
 		}
-		cfg.ContextSize = &defaultCtx
+		if cfg.RepeatPenalty == 0.0 {
+			cfg.RepeatPenalty = settings.RepeatPenalty
+		}
+	} else {
+		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: no template found for family")
+	}
+
+	if cfg.HasTemplate() {
+		return
+	}
+
+	// no template was set from the family defaults: fall back to the raw jinja chat template stored in the GGUF metadata
+	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
+	if found {
+		// try to use the jinja template
+		cfg.TemplateConfig.JinjaTemplate = true
+		cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString()
+	}
+}
+
+func identifyFamily(f *gguf.GGUFFile) familyType {
+
+	// identify from well known templates first
+	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
+	if found && chatTemplate.ValueString() != "" {
+		if family, ok := knownTemplates[chatTemplate.ValueString()]; ok {
+			return family
+		}
+	}
+
+	// otherwise try to identify from the model properties
+	arch := f.Architecture().Architecture
+	eosTokenID := f.Tokenizer().EOSTokenID
+	bosTokenID := f.Tokenizer().BOSTokenID
+
+	isYI := arch == "llama" && bosTokenID == 1 && eosTokenID == 2
+	// NOTE: Mistral 0.3 and Yi share the same bosTokenID and eosTokenID, so this check cannot tell them apart
+
+	llama3 := arch == "llama" && eosTokenID == 128009
+	commandR := arch == "command-r" && eosTokenID == 255001
+	qwen2 := arch == "qwen2"
+	phi3 := arch == "phi-3"
+	gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Model().Name), "gemma")
+	deepseek2 := arch == "deepseek2"
+
+	switch {
+	case deepseek2:
+		return DeepSeek2
+	case gemma:
+		return Gemma
+	case llama3:
+		return LLaMa3
+	case commandR:
+		return CommandR
+	case phi3:
+		return Phi3
+	case qwen2, isYI:
+		return ChatML
+	default:
+		return Unknown
 	}
 }
--- a/core/http/app.go
+++ b/core/http/app.go
@@ -142,9 +142,9 @@ func API(application *application.Application) (*fiber.App, error) {
 	httpFS := http.FS(embedDirStatic)

 	router.Use(favicon.New(favicon.Config{
-		URL:        "/favicon.svg",
+		URL:        "/favicon.ico",
 		FileSystem: httpFS,
-		File:       "static/favicon.svg",
+		File:       "static/favicon.ico",
 	}))

 	router.Use("/static", filesystem.New(filesystem.Config{
--- a/core/http/elements/gallery.go
+++ b/core/http/elements/gallery.go
@@ -122,15 +122,15 @@ func modelModal(m *gallery.GalleryModel) elem.Node {
 			"id":          modalName(m),
 			"tabindex":    "-1",
 			"aria-hidden": "true",
-			"class":       "hidden fixed top-0 right-0 left-0 z-50 justify-center items-center w-full md:inset-0 h-full max-h-full bg-gray-900/50",
+			"class":       "hidden overflow-y-auto overflow-x-hidden fixed top-0 right-0 left-0 z-50 justify-center items-center w-full md:inset-0 h-[calc(100%-1rem)] max-h-full",
 		},
 		elem.Div(
 			attrs.Props{
-				"class": "relative p-4 w-full max-w-2xl h-[90vh] mx-auto mt-[5vh]",
+				"class": "relative p-4 w-full max-w-2xl max-h-full",
 			},
 			elem.Div(
 				attrs.Props{
-					"class": "relative bg-white rounded-lg shadow dark:bg-gray-700 h-full flex flex-col",
+					"class": "relative p-4 w-full max-w-2xl max-h-full bg-white rounded-lg shadow dark:bg-gray-700",
 				},
 				// header
 				elem.Div(
@@ -164,13 +164,14 @@ func modelModal(m *gallery.GalleryModel) elem.Node {
 				// body
 				elem.Div(
 					attrs.Props{
-						"class": "p-4 md:p-5 space-y-4 overflow-y-auto flex-1 min-h-0",
+						"class": "p-4 md:p-5 space-y-4",
 					},
 					elem.Div(
 						attrs.Props{
 							"class": "flex justify-center items-center",
 						},
 						elem.Img(attrs.Props{
+							//	"class": "rounded-t-lg object-fit object-center h-96",
 							"class":   "lazy rounded-t-lg max-h-48 max-w-96 object-cover mt-3 entered loaded",
 							"src":     m.Icon,
 							"loading": "lazy",
@@ -231,6 +232,7 @@ func modelModal(m *gallery.GalleryModel) elem.Node {
 			),
 		),
 	)
+
 }

 func modelDescription(m *gallery.GalleryModel) elem.Node {
--- a/core/http/endpoints/localai/stores.go
+++ b/core/http/endpoints/localai/stores.go
@@ -21,7 +21,6 @@ func StoresSetEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfi
 		if err != nil {
 			return err
 		}
-		defer sl.Close()

 		vals := make([][]byte, len(input.Values))
 		for i, v := range input.Values {
@@ -49,7 +48,6 @@ func StoresDeleteEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationCo
 		if err != nil {
 			return err
 		}
-		defer sl.Close()

 		if err := store.DeleteCols(c.Context(), sb, input.Keys); err != nil {
 			return err
@@ -71,7 +69,6 @@ func StoresGetEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfi
 		if err != nil {
 			return err
 		}
-		defer sl.Close()

 		keys, vals, err := store.GetCols(c.Context(), sb, input.Keys)
 		if err != nil {
@@ -103,7 +100,6 @@ func StoresFindEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConf
 		if err != nil {
 			return err
 		}
-		defer sl.Close()

 		keys, vals, similarities, err := store.Find(c.Context(), sb, input.Key, input.Topk)
 		if err != nil {
--- a/core/http/endpoints/openai/assistant_test.go
+++ b/core/http/endpoints/openai/assistant_test.go
@@ -40,7 +40,7 @@ func TestAssistantEndpoints(t *testing.T) {
 	cl := &config.BackendConfigLoader{}
 	//configsDir := "/tmp/localai/configs"
 	modelPath := "/tmp/localai/model"
-	var ml = model.NewModelLoader(modelPath, false)
+	var ml = model.NewModelLoader(modelPath)

 	appConfig := &config.ApplicationConfig{
 		ConfigsDir:    configsDir,
--- a/core/http/explorer.go
+++ b/core/http/explorer.go
@@ -29,9 +29,9 @@ func Explorer(db *explorer.Database) *fiber.App {
 	httpFS := http.FS(embedDirStatic)

 	app.Use(favicon.New(favicon.Config{
-		URL:        "/favicon.svg",
+		URL:        "/favicon.ico",
 		FileSystem: httpFS,
-		File:       "static/favicon.svg",
+		File:       "static/favicon.ico",
 	}))

 	app.Use("/static", filesystem.New(filesystem.Config{
--- a/core/http/routes/localai.go
+++ b/core/http/routes/localai.go
@@ -50,10 +50,11 @@ func RegisterLocalAIRoutes(router *fiber.App,
 	router.Post("/v1/vad", vadChain...)

 	// Stores
-	router.Post("/stores/set", localai.StoresSetEndpoint(ml, appConfig))
-	router.Post("/stores/delete", localai.StoresDeleteEndpoint(ml, appConfig))
-	router.Post("/stores/get", localai.StoresGetEndpoint(ml, appConfig))
-	router.Post("/stores/find", localai.StoresFindEndpoint(ml, appConfig))
+	sl := model.NewModelLoader("")
+	router.Post("/stores/set", localai.StoresSetEndpoint(sl, appConfig))
+	router.Post("/stores/delete", localai.StoresDeleteEndpoint(sl, appConfig))
+	router.Post("/stores/get", localai.StoresGetEndpoint(sl, appConfig))
+	router.Post("/stores/find", localai.StoresFindEndpoint(sl, appConfig))

 	if !appConfig.DisableMetrics {
 		router.Get("/metrics", localai.LocalAIMetricsEndpoint())
--- a/core/http/static/favicon.ico
+++ b/core/http/static/favicon.ico
--- a/core/http/static/favicon.svg
+++ b/core/http/static/favicon.svg
--- a/core/http/static/logo.png
+++ b/core/http/static/logo.png
--- a/core/http/static/logo_horizontal.png
+++ b/core/http/static/logo_horizontal.png
--- a/core/http/views/login.html
+++ b/core/http/views/login.html
@@ -12,7 +12,7 @@
        <div class="max-w-md w-full bg-gray-800/90 border border-gray-700/50 rounded-xl overflow-hidden shadow-xl">
            <div class="animation-container">
                <div class="text-overlay">
-                    <img src="static/logo.png" alt="LocalAI Logo" class="h-32">
+                <!--    <i class="fas fa-circle-nodes text-5xl text-blue-400 mb-2"></i> -->
                </div>
            </div>
            
--- a/core/http/views/partials/head.html
+++ b/core/http/views/partials/head.html
@@ -3,7 +3,7 @@
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>{{.Title}}</title>
  <base href="{{.BaseURL}}" />
-  <link rel="shortcut icon" href="static/favicon.svg" type="image/svg">
+  <link rel="icon" type="image/x-icon" href="favicon.ico" />
  <link rel="stylesheet" href="static/assets/highlightjs.css" />
  <script defer src="static/assets/highlightjs.js"></script>
  <script defer src="static/assets/alpine.js"></script>
--- a/core/http/views/partials/navbar.html
+++ b/core/http/views/partials/navbar.html
@@ -4,9 +4,10 @@
            <div class="flex items-center">
                <!-- Logo Image -->
                <a href="./" class="flex items-center group">
-                    <img src="static/logo_horizontal.png" 
+                    <img src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd" 
                         alt="LocalAI Logo" 
-                         class="h-14 mr-3 brightness-110 transition-all duration-300 group-hover:brightness-125">
+                         class="h-10 mr-3 rounded-lg border border-blue-600/30 shadow-md transition-all duration-300 group-hover:shadow-blue-500/20 group-hover:border-blue-500/50">
+                    <span class="text-white text-xl font-bold bg-clip-text text-transparent bg-gradient-to-r from-blue-400 to-indigo-400">LocalAI</span>
                </a>
            </div>
            
--- a/core/http/views/partials/navbar_explorer.html
+++ b/core/http/views/partials/navbar_explorer.html
@@ -4,9 +4,10 @@
            <div class="flex items-center">
                <!-- Logo Image -->
                <a href="./" class="flex items-center group">
-                    <img src="static/logo_horizontal.png" 
+                    <img src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd" 
                         alt="LocalAI Logo" 
                         class="h-10 mr-3 rounded-lg border border-blue-600/30 shadow-md transition-all duration-300 group-hover:shadow-blue-500/20 group-hover:border-blue-500/50">
+                    <span class="text-white text-xl font-bold bg-clip-text text-transparent bg-gradient-to-r from-blue-400 to-indigo-400">LocalAI</span>
                </a>
            </div>
            
--- a/docs/assets/images/imagen.png
+++ b/docs/assets/images/imagen.png
--- a/docs/assets/images/localai_screenshot.png
+++ b/docs/assets/images/localai_screenshot.png
--- a/docs/assets/images/logos/logo.png
+++ b/docs/assets/images/logos/logo.png
--- a/docs/assets/images/logos/logo.svg
+++ b/docs/assets/images/logos/logo.svg
--- a/docs/assets/images/screenshots/screenshot_chat.png
+++ b/docs/assets/images/screenshots/screenshot_chat.png
--- a/docs/assets/images/screenshots/screenshot_gallery.png
+++ b/docs/assets/images/screenshots/screenshot_gallery.png
--- a/docs/assets/images/screenshots/screenshot_home.png
+++ b/docs/assets/images/screenshots/screenshot_home.png
--- a/docs/assets/images/screenshots/screenshot_image.png
+++ b/docs/assets/images/screenshots/screenshot_image.png
--- a/docs/assets/images/screenshots/screenshot_login.png
+++ b/docs/assets/images/screenshots/screenshot_login.png
--- a/docs/assets/images/screenshots/screenshot_p2p.png
+++ b/docs/assets/images/screenshots/screenshot_p2p.png
--- a/docs/assets/images/screenshots/screenshot_talk.png
+++ b/docs/assets/images/screenshots/screenshot_talk.png
--- a/docs/assets/images/screenshots/screenshot_tts.png
+++ b/docs/assets/images/screenshots/screenshot_tts.png
--- a/docs/assets/jsconfig.json
+++ b/docs/assets/jsconfig.json
@@ -3,7 +3,7 @@
  "baseUrl": ".",
  "paths": {
   "*": [
-    "../../../../.cache/hugo_cache/modules/filecache/modules/pkg/mod/github.com/gohugoio/hugo-mod-jslibs-dist/popperjs/v2@v2.21100.20000/package/dist/cjs/*",
+    "../../../../.cache/hugo_cache/modules/filecache/modules/pkg/mod/github.com/gohugoio/hugo-mod-jslibs-dist/popperjs/v2@v2.21100.20000/package/dist/cjs/popper.js/*",
    "../../../../.cache/hugo_cache/modules/filecache/modules/pkg/mod/github.com/twbs/bootstrap@v5.3.2+incompatible/js/*"
   ]
  }
--- a/docs/config.toml
+++ b/docs/config.toml
@@ -48,9 +48,9 @@ defaultContentLanguage = 'en'

    [params.docs] # Parameters for the /docs 'template'

-        logo = "https://raw.githubusercontent.com/mudler/LocalAI/refs/heads/master/core/http/static/logo.png"
-        logo_text = ""
-        title           = "LocalAI"           # default html title for documentation pages/sections
+        logo = "https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"
+        logo_text = "LocalAI"
+        title           = "LocalAI documentation"           # default html title for documentation pages/sections

        pathName        = "docs"                            # path name for documentation site | default "docs"

@@ -108,7 +108,6 @@ defaultContentLanguage = 'en'
        # indexName = "" # Index Name to perform search on (or set env variable HUGO_PARAM_DOCSEARCH_indexName)

    [params.analytics] # Parameters for Analytics (Google, Plausible)
-        # google = "G-XXXXXXXXXX" # Replace with your Google Analytics ID
        # plausibleURL    = "/docs/s" # (or set via env variable HUGO_PARAM_ANALYTICS_plausibleURL)
        # plausibleAPI    = "/docs/s" # optional - (or set via env variable HUGO_PARAM_ANALYTICS_plausibleAPI)
        # plausibleDomain = ""      # (or set via env variable HUGO_PARAM_ANALYTICS_plausibleDomain)
@@ -152,7 +151,7 @@ defaultContentLanguage = 'en'

 [languages]
  [languages.en]
-    title = "LocalAI"
+    title = "LocalAI documentation"
    languageName = "English"
    weight = 10
 #  [languages.fr]
--- a/docs/content/docs/features/distributed_inferencing.md
+++ b/docs/content/docs/features/distributed_inferencing.md
@@ -13,8 +13,6 @@ LocalAI supports two modes of distributed inferencing via p2p:
 - **Federated Mode**: Requests are shared between the cluster and routed to a single worker node in the network based on the load balancer's decision.
 - **Worker Mode** (aka "model sharding" or "splitting weights"): Requests are processed by all the workers which contributes to the final inference result (by sharing the model weights).

-A list of global instances shared by the community is available at [explorer.localai.io](https://explorer.localai.io).
-
 ## Usage

 Starting LocalAI with `--p2p` generates a shared token for connecting multiple instances: and that's all you need to create AI clusters, eliminating the need for intricate network setups. 
--- a/docs/content/docs/getting-started/quickstart.md
+++ b/docs/content/docs/getting-started/quickstart.md
@@ -18,45 +18,14 @@ To access the WebUI with an API_KEY, browser extensions such as [Requestly](http

 {{% /alert %}}

-## Quickstart
+## Using the Bash Installer

+Install LocalAI easily using the bash installer with the following command:

-### Using the Bash Installer
-```bash
+```sh
 curl https://localai.io/install.sh | sh
 ```

-### Run with docker:
-```bash
-# CPU only image:
-docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-cpu
-
-# Nvidia GPU:
-docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
-
-# CPU and GPU image (bigger size):
-docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
-
-# AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)
-docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
-```
-
-### Load models:
-
-```bash
-# From the model gallery (see available models with `local-ai models list`, in the WebUI from the model tab, or visiting https://models.localai.io)
-local-ai run llama-3.2-1b-instruct:q4_k_m
-# Start LocalAI with the phi-2 model directly from huggingface
-local-ai run huggingface://TheBloke/phi-2-GGUF/phi-2.Q8_0.gguf
-# Install and run a model from the Ollama OCI registry
-local-ai run ollama://gemma:2b
-# Run a model from a configuration file
-local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
-# Install and run a model from a standard OCI registry (e.g., Docker Hub)
-local-ai run oci://localai/phi-2:latest
-```
-
-
 For a full list of options, refer to the [Installer Options]({{% relref "docs/advanced/installer" %}}) documentation.

 Binaries can also be [manually downloaded]({{% relref "docs/reference/binaries" %}}).
--- a/docs/content/docs/overview.md
+++ b/docs/content/docs/overview.md
@@ -1,3 +1,4 @@
+
 +++
 title = "Overview"
 weight = 1
@@ -6,96 +7,162 @@ description = "What is LocalAI?"
 tags = ["Beginners"]
 categories = [""]
 author = "Ettore Di Giacinto"
+# This lets this page override the landing page
+url = '/'
 icon = "info"
 +++

-# Welcome to LocalAI
+<p align="center">
+<a href="https://localai.io"><img width=512 src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"></a>
+</p>

-LocalAI is your complete AI stack for running AI models locally. It's designed to be simple, efficient, and accessible, providing a drop-in replacement for OpenAI's API while keeping your data private and secure.
+<p align="center">
+<a href="https://github.com/go-skynet/LocalAI/fork" target="blank">
+<img src="https://img.shields.io/github/forks/go-skynet/LocalAI?style=for-the-badge" alt="LocalAI forks"/>
+</a>
+<a href="https://github.com/go-skynet/LocalAI/stargazers" target="blank">
+<img src="https://img.shields.io/github/stars/go-skynet/LocalAI?style=for-the-badge" alt="LocalAI stars"/>
+</a>
+<a href="https://github.com/go-skynet/LocalAI/pulls" target="blank">
+<img src="https://img.shields.io/github/issues-pr/go-skynet/LocalAI?style=for-the-badge" alt="LocalAI pull-requests"/>
+</a>
+<a href='https://github.com/go-skynet/LocalAI/releases'>
+<img src='https://img.shields.io/github/release/go-skynet/LocalAI?&label=Latest&style=for-the-badge'>
+</a>
+</p>

-## Why LocalAI?
+<p align="center">
+<a href="https://hub.docker.com/r/localai/localai" target="blank">
+<img src="https://img.shields.io/badge/dockerhub-images-important.svg?logo=Docker" alt="LocalAI Docker hub"/>
+</a>
+<a href="https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest" target="blank">
+<img src="https://img.shields.io/badge/quay.io-images-important.svg?" alt="LocalAI Quay.io"/>
+</a>
+</p>

-In today's AI landscape, privacy, control, and flexibility are paramount. LocalAI addresses these needs by:
+<p align="center">
+<a href="https://trendshift.io/repositories/5539" target="_blank"><img src="https://trendshift.io/api/badge/repositories/5539" alt="mudler%2FLocalAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
+</p>

- **Privacy First**: Your data never leaves your machine
- **Complete Control**: Run models on your terms, with your hardware
- **Open Source**: MIT licensed and community-driven
- **Flexible Deployment**: From laptops to servers, with or without GPUs
- **Extensible**: Add new models and features as needed
+<p align="center">
+<a href="https://twitter.com/LocalAI_API" target="blank">
+<img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/>
+</a>
+<a href="https://discord.gg/uJAeKSAGDy" target="blank">
+<img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>
+</a>
+</p>

-## Core Components
+> 💡 Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [💭Discord](https://discord.gg/uJAeKSAGDy)
+>
+> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/go-skynet/LocalAI/tree/master/examples/) 

-LocalAI is more than just a single tool - it's a complete ecosystem:

-1. **[LocalAI Core](https://github.com/mudler/LocalAI)**
-   - OpenAI-compatible API
-   - Multiple model support (LLMs, image, audio)
-   - No GPU required
-   - Fast inference with native bindings
-   - [Github repository](https://github.com/mudler/LocalAI)
+**LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API that's compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs and generate images, audio, and more, locally or on-prem with consumer-grade hardware, supporting multiple model families and architectures. It does not require a GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).

-2. **[LocalAGI](https://github.com/mudler/LocalAGI)**
-   - Autonomous AI agents
-   - No coding required
-   - WebUI and REST API support
-   - Extensible agent framework
-   - [Github repository](https://github.com/mudler/LocalAGI)

-3. **[LocalRecall](https://github.com/mudler/LocalRecall)**
-   - Semantic search
-   - Memory management
-   - Vector database
-   - Perfect for AI applications
-   - [Github repository](https://github.com/mudler/LocalRecall)
+## Start LocalAI

-## Getting Started
+Start the image with Docker to have a functional clone of OpenAI! 🚀:

-The fastest way to get started is with our one-line installer:
+```bash
+docker run -p 8080:8080 --name local-ai -ti localai/localai:latest-aio-cpu
+# Do you have an Nvidia GPU? Use this instead
+# CUDA 11
+# docker run -p 8080:8080 --gpus all --name local-ai -ti localai/localai:latest-aio-gpu-nvidia-cuda-11
+# CUDA 12
+# docker run -p 8080:8080 --gpus all --name local-ai -ti localai/localai:latest-aio-gpu-nvidia-cuda-12
+```
+
+Or just use the bash installer:

 ```bash
 curl https://localai.io/install.sh | sh
 ```

-Or use Docker for a quick start:
+See the [💻 Quickstart](https://localai.io/basics/getting_started/) for all the options and ways you can run LocalAI!

-```bash
-docker run -p 8080:8080 --name local-ai -ti localai/localai:latest-aio-cpu
-```
+## What is LocalAI?

-For more detailed installation options and configurations, see our [Getting Started guide](/basics/getting_started/).
+In a nutshell:

-## Key Features
+- Local, OpenAI drop-in alternative REST API. You own your data.
+- NO GPU required. NO Internet access is required either
+  - Optional, GPU Acceleration is available. See also the [build section](https://localai.io/basics/build/index.html).
+- Supports multiple models
+- 🏃 Once loaded the first time, it keeps models loaded in memory for faster inference
+- ⚡ Doesn't shell-out, but uses bindings for a faster inference and better performance.

- **Text Generation**: Run various LLMs locally
- **Image Generation**: Create images with stable diffusion
- **Audio Processing**: Text-to-speech and speech-to-text
- **Vision API**: Image understanding and analysis
- **Embeddings**: Vector database support
- **Functions**: OpenAI-compatible function calling
- **P2P**: Distributed inference capabilities
+LocalAI is focused on making the AI accessible to anyone. Any contribution, feedback and PR is welcome!

-## Community and Support
+Note that this started just as a fun weekend project by [mudler](https://github.com/mudler) in order to try to create the necessary pieces for a full AI assistant like `ChatGPT`: the community is growing fast and we are working hard to make it better and more stable. If you want to help, please consider contributing (see below)!

-LocalAI is a community-driven project. You can:
+### 🚀 Features

- Join our [Discord community](https://discord.gg/uJAeKSAGDy)
- Check out our [GitHub repository](https://github.com/mudler/LocalAI)
- Contribute to the project
- Share your use cases and examples
+- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
+- 🗣 [Text to Audio](https://localai.io/features/text-to-audio/)
+- 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`)
+- 🎨 [Image generation with stable diffusion](https://localai.io/features/image-generation)
+- 🔥 [OpenAI functions](https://localai.io/features/openai-functions/) 🆕
+- 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
+- ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
+- 🖼️ [Download Models directly from Huggingface](https://localai.io/models/)
+- 🥽 [Vision API](https://localai.io/features/gpt-vision/)
+- 💾 [Stores](https://localai.io/stores)
+- 📈 [Reranker](https://localai.io/features/reranker/)
+- 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)

-## Next Steps
+## Contribute and help

-Ready to dive in? Here are some recommended next steps:
+To help the project you can:

-1. [Install LocalAI](/basics/getting_started/)
-2. [Explore available models](https://models.localai.io)
-3. [Model compatibility](/model-compatibility/)
-4. [Try out examples](https://github.com/mudler/LocalAI-examples)
-5. [Join the community](https://discord.gg/uJAeKSAGDy)
-6. [Check the LocalAI Github repository](https://github.com/mudler/LocalAI)
-7. [Check the LocalAGI Github repository](https://github.com/mudler/LocalAGI)
+- If you have technical skills and want to contribute to development, have a look at the open issues. If you are new, you can have a look at the [good-first-issue](https://github.com/go-skynet/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) and [help-wanted](https://github.com/go-skynet/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) labels.

+- If you don't have technical skills you can still help by improving the documentation, [adding examples](https://github.com/go-skynet/LocalAI/tree/master/examples), or sharing your user stories with our community. Any help and contribution is welcome!

-## License
+## 🌟 Star history

-LocalAI is MIT licensed, created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
+[![LocalAI Star history Chart](https://api.star-history.com/svg?repos=mudler/LocalAI&type=Date)](https://star-history.com/#mudler/LocalAI&Date)
+
+## ❤️ Sponsors
+
+> Do you find LocalAI useful?
+
+Support the project by becoming [a backer or sponsor](https://github.com/sponsors/mudler). Your logo will show up here with a link to your website.
+
+A huge thank you to our generous sponsors who support this project by covering CI expenses, and to everyone on our [Sponsor list](https://github.com/sponsors/mudler):
+
+<p align="center">
+  <a href="https://www.spectrocloud.com/" target="blank">
+    <img width=200 src="https://github.com/user-attachments/assets/72eab1dd-8b93-4fc0-9ade-84db49f24962">
+  </a>
+  <a href="https://www.premai.io/" target="blank">
+    <img  width=200 src="https://github.com/mudler/LocalAI/assets/2420543/42e4ca83-661e-4f79-8e46-ae43689683d6"> <br>
+  </a>
+</p>
+
+## 📖 License
+
+LocalAI is a community-driven project created by [Ettore Di Giacinto](https://github.com/mudler/).
+
+MIT - Author Ettore Di Giacinto
+
+## 🙇 Acknowledgements
+
+LocalAI couldn't have been built without the help of great software already available from the community. Thank you!
+
+- [llama.cpp](https://github.com/ggerganov/llama.cpp)
+- https://github.com/tatsu-lab/stanford_alpaca
+- https://github.com/cornelk/llama-go for the initial ideas
+- https://github.com/antimatter15/alpaca.cpp
+- https://github.com/EdVince/Stable-Diffusion-NCNN
+- https://github.com/ggerganov/whisper.cpp
+- https://github.com/saharNooby/rwkv.cpp
+- https://github.com/rhasspy/piper
+
+## 🤗 Contributors
+
+This is a community project, a special thanks to our contributors! 🤗
+<a href="https://github.com/go-skynet/LocalAI/graphs/contributors">
+  <img src="https://contrib.rocks/image?repo=go-skynet/LocalAI" />
+</a>
--- a/docs/data/landing.yaml
+++ b/docs/data/landing.yaml
@@ -2,212 +2,38 @@

 # Hero
 hero:
-  enable: true
+  enable: false
  weight: 10
  template: hero

-  backgroundImage:
-    path: "images/templates/hero"
-    filename:
-      desktop: "gradient-desktop.webp"
-      mobile: "gradient-mobile.webp"
-
-  badge:
-    text: "⭐ 31.8k+ stars on GitHub!"
-    color: primary
-    pill: false
-    soft: true
-
-  titleLogo:
-    path: "images/logos"
-    filename: "logo.png"
-    alt: "LocalAI Logo"
-    height: 340px
-
-  title: ""
-  subtitle: |
-    **The free, OpenAI, Anthropic alternative. Your All-in-One Complete AI Stack** - Run powerful language models, autonomous agents, and document intelligence **locally** on your hardware. 
-    
-    **No cloud, no limits, no compromise.**
-
-  image:
-    path: "images"
-    filename: "localai_screenshot.png"
-    alt: "LocalAI Screenshot"
-    boxShadow: true
-    rounded: true
-
-  ctaButton:
-    icon: rocket_launch
-    btnText: "Get Started"
-    url: "/basics/getting_started/"
-  cta2Button:
-    icon: code
-    btnText: "View on GitHub"
-    url: "https://github.com/mudler/LocalAI"
-
-  info: |
-    **Drop-in replacement for OpenAI API** - modular suite of tools that work seamlessly together or independently. 
-    
-    Start with **[LocalAI](https://localai.io)**'s OpenAI-compatible API, extend with **[LocalAGI](https://github.com/mudler/LocalAGI)**'s autonomous agents, and enhance with **[LocalRecall](https://github.com/mudler/LocalRecall)**'s semantic search - all running locally on your hardware.
-
-    **Open Source** MIT Licensed.
-
 # Feature Grid
 featureGrid:
-  enable: true
+  enable: false
  weight: 20
  template: feature grid

-  title: Why choose LocalAI?
-  subtitle: |
-    **OpenAI API Compatible** - Run AI models locally with our modular ecosystem. From language models to autonomous agents and semantic search, build your complete AI stack without the cloud.
-
-  items:
-    - title: LLM Inferencing
-      icon: memory_alt
-      description: LocalAI is a free, **Open Source** OpenAI alternative. Run **LLMs**, generate **images**, **audio** and more **locally** with consumer grade hardware.
-      ctaLink:
-        text: learn more
-        url: /basics/getting_started/
-    - title: Agentic-first
-      icon: smart_toy
-      description: |
-        Extend LocalAI with LocalAGI, an autonomous AI agent platform that runs locally, no coding required. 
-        Build and deploy autonomous agents with ease. Interact with REST APIs or use the WebUI.
-      ctaLink:
-        text: learn more
-        url: https://github.com/mudler/LocalAGI
-
-    - title: Memory and Knowledge base
-      icon: psychology
-      description: 
-        Extend LocalAI with LocalRecall, A local rest api for semantic search and memory management. Perfect for AI applications.
-      ctaLink:
-        text: learn more
-        url: https://github.com/mudler/LocalRecall
-
-    - title: OpenAI Compatible
-      icon: api
-      description: Drop-in replacement for OpenAI API. Compatible with existing applications and libraries.
-      ctaLink:
-        text: learn more
-        url: /basics/getting_started/
-
-    - title: No GPU Required
-      icon: memory
-      description: Run on consumer grade hardware. No need for expensive GPUs or cloud services.
-      ctaLink:
-        text: learn more
-        url: /basics/getting_started/
-
-    - title: Multiple Models
-      icon: hub
-      description: |
-          Support for various model families including LLMs, image generation, and audio models.
-          Supports multiple backends for inferencing, including vLLM, llama.cpp, and more.
-          You can switch between them as needed and install them from the Web interface or the CLI.
-      ctaLink:
-        text: learn more
-        url: /model-compatibility
-
-    - title: Privacy Focused
-      icon: security
-      description: Keep your data local. No data leaves your machine, ensuring complete privacy.
-      ctaLink:
-        text: learn more
-        url: /basics/container/
-
-    - title: Easy Setup
-      icon: settings
-      description: Simple installation and configuration. Get started in minutes with Binaries installation, Docker, Podman, Kubernetes or local installation.
-      ctaLink:
-        text: learn more
-        url: /basics/getting_started/
-
-    - title: Community Driven
-      icon: groups
-      description: Active community support and regular updates. Contribute and help shape the future of LocalAI.
-      ctaLink:
-        text: learn more
-        url: https://github.com/mudler/LocalAI
-
-
-
-    - title: Extensible
-      icon: extension
-      description: Easy to extend and customize. Add new models and features as needed.
-      ctaLink:
-        text: learn more
-        url: /docs/integrations/
-
-    - title: Peer 2 Peer
-      icon: hub
-      description: |
-        LocalAI is designed to be a decentralized LLM inference, powered by a peer-to-peer system based on libp2p. 
-        It is designed to be used in a local or remote network, and is compatible with any LLM model. 
-        It works both in federated mode or by splitting models weights.
-      ctaLink:
-        text: learn more
-        url: /features/distribute/
-
-    - title: Open Source
-      icon: code
-      description: MIT licensed. Free to use, modify, and distribute. Community contributions welcome.
-      ctaLink:
-        text: learn more
-        url: https://github.com/mudler/LocalAI
-
 imageText:
  enable: true
  weight: 25
  template: image text

-  title: Run AI models locally with ease
-  subtitle: |
-    LocalAI makes it simple to run various AI models on your own hardware. From text generation to image creation, autonomous agents to semantic search - all orchestrated through a unified API.
+  title: LocalAI
+  subtitle: The Free, Open Source OpenAI Alternative

  list:
-    - text: OpenAI API compatibility
-      icon: api
+    - text: Optimized, fast inference
+      icon: speed

-    - text: Multiple model support
-      icon: hub
+    - text: Comprehensive support for many model architectures
+      icon: area_chart

-    - text: Image understanding
-      icon: image
-    
-    - text: Image generation
-      icon: image
-
-    - text: Audio generation
-      icon: music_note
-
-    - text: Voice activity detection
-      icon: mic
-
-    - text: Speech recognition
-      icon: mic
-
-    - text: Video generation
-      icon: movie
-
-    - text: Privacy focused
-      icon: security
-
-    - text: Autonomous agents with [LocalAGI](https://github.com/mudler/LocalAGI)
-      icon: smart_toy
-
-    - text: Semantic search with [LocalRecall](https://github.com/mudler/LocalRecall)
-      icon: psychology
-
-    - text: Agent orchestration
-      icon: hub
+    - text: Easy to deploy with Docker
+      icon: accessibility

  image:
-    path: "images"
-    filename: "imagen.png"
-    alt: "LocalAI Image generation"
+    path: "images/logos"
+    filename: "logo.png"
+    alt: "LocalAI logo" # Optional but recommended

  imgOrder:
    desktop: 2
@@ -215,62 +41,10 @@ imageText:

  ctaButton:
    text: Learn more
-    url: "/basics/getting_started/"
+    url: "/docs/"

 # Image compare
 imageCompare:
  enable: false
  weight: 30
  template: image compare
-
-  title: LocalAI in Action
-  subtitle: See how LocalAI can transform your local AI experience with various models and capabilities.
-
-  items:
-    - title: Text Generation
-      config: {
-        startingPoint: 50,
-        addCircle: true,
-        addCircleBlur: false,
-        showLabels: true,
-        labelOptions: {
-          before: 'Dark',
-          after: 'Light',
-          onHover: false
-        }
-      }
-      imagePath: "images/screenshots"
-      imageBefore: "text_generation_input.webp"
-      imageAfter: "text_generation_output.webp"
-
-    - title: Image Generation
-      config: {
-        startingPoint: 50,
-        addCircle: true,
-        addCircleBlur: true,
-        showLabels: true,
-        labelOptions: {
-          before: 'Prompt',
-          after: 'Result',
-          onHover: true
-        }
-      }
-      imagePath: "images/screenshots"
-      imageBefore: "imagen_before.webp"
-      imageAfter: "imagen_after.webp"
-
-    - title: Audio Generation
-      config: {
-        startingPoint: 50,
-        addCircle: true,
-        addCircleBlur: false,
-        showLabels: true,
-        labelOptions: {
-          before: 'Text',
-          after: 'Audio',
-          onHover: false
-        }
-      }
-      imagePath: "images/screenshots"
-      imageBefore: "audio_generation_text.webp"
-      imageAfter: "audio_generation_waveform.webp"
--- a/docs/data/version.json
+++ b/docs/data/version.json
@@ -1,3 +1,3 @@
 {
-  "version": "v2.27.0"
+  "version": "v2.26.0"
 }
--- a/docs/layouts/index.html
+++ b/docs/layouts/index.html
--- a/docs/layouts/partials/docs/top-header.html
+++ b/docs/layouts/partials/docs/top-header.html
@@ -82,7 +82,7 @@
                </span>
            </button>
            {{ end -}}
-            {{ if hugo.IsMultilingual }}
+            {{ if .Site.IsMultiLingual }}
                <div class="dropdown">
                    <button class="btn btn-link btn-default dropdown-toggle ps-2" type="button" data-bs-toggle="dropdown" aria-expanded="false">
                        {{ site.Language.Lang | upper }}
--- a/docs/layouts/partials/head.html
+++ b/docs/layouts/partials/head.html
@@ -18,10 +18,10 @@
    <!-- Custom CSS -->
    {{- $options := dict "enableSourceMap" true }}
    {{- if hugo.IsProduction}}
-        {{- $options = dict "enableSourceMap" false "outputStyle" "compressed" }}
+        {{- /* Assign with "=" (not ":=") so the outer $options is updated; ":=" would declare a new variable scoped to this if-block and the production settings would be discarded. */ -}}{{- $options = dict "enableSourceMap" false "outputStyle" "compressed" }}
    {{- end }}
    {{- $style := resources.Get "/scss/style.scss" }}
-    {{- $style = $style | resources.ExecuteAsTemplate "/scss/style.scss" . | css.Sass $options }}
+    {{- $style = $style | resources.ExecuteAsTemplate "/scss/style.scss" . | resources.ToCSS $options }}
    {{- if hugo.IsProduction }}
        {{- $style = $style | minify | fingerprint "sha384" }}
    {{- end -}}
@@ -39,7 +39,7 @@
    <!-- Image Compare Viewer -->
    {{ if ($.Scratch.Get "image_compare_enabled") }}
        {{ $imagecompare := resources.Get "js/image-compare-viewer.min.js" }}
-        {{- if not hugo.IsDevelopment }}
+        {{- if not .Site.IsServer }}
            {{- $js := (slice $imagecompare) | resources.Concat "/js/image-compare.js" | minify | fingerprint "sha384" }}
            <script type="text/javascript" src="{{ $js.Permalink }}" integrity="{{ $js.Data.Integrity }}"></script>
        {{- else }}
@@ -48,14 +48,14 @@
        {{- end }}
    {{- end }}
    <!-- Plausible Analytics Config -->
-    {{- if not hugo.IsDevelopment }}
+    {{- if not .Site.IsServer }}
    {{ if and (.Site.Params.plausible.scriptURL) (.Site.Params.plausible.dataDomain) -}}
        {{- partialCached "head/plausible" . }}
    {{- end -}}
    {{- end -}}
    <!-- Google Analytics v4 Config -->
-    {{- if not hugo.IsDevelopment }}
-    {{- if .Site.Params.analytics.google }}
+    {{- if not .Site.IsServer }}
+    {{- if .Site.GoogleAnalytics }}
        {{- template "_internal/google_analytics.html" . -}}
    {{- end -}}
    {{- end -}}
--- a/docs/layouts/partials/header.html
+++ b/docs/layouts/partials/header.html
@@ -1,57 +0,0 @@
-<!-- Navbar Start -->
-<header id="topnav">
-    <div class="container d-flex justify-content-between align-items-center">
-        <!-- Logo container-->
-        <a class="logo" aria-label="Home" href='{{ relLangURL "" }}'>
-            
-        </a>
-        <!-- End Logo container-->
-
-        <div class="d-flex align-items-center">
-
-            <div id="navigation">
-                <!-- Navigation Menu -->
-                <ul class="navigation-menu nav-right">
-                    {{- range .Site.Menus.primary }}
-                    <li><a href="{{ relLangURL .URL }}">{{ .Name }}</a></li>
-                    {{ end }}
-                </ul><!--end navigation menu-->
-            </div><!--end navigation-->
-
-            <!-- Social Links Start -->
-            {{ with $.Scratch.Get "social_list" }}
-            <ul class="social-link d-flex list-inline mb-0">
-                {{ range . }}
-                    {{ $path := printf "images/social/%s.%s" . "svg" }}
-                    <li class="list-inline-item mb-0">
-                        <a href="{{ if eq . `rss` }} {{ `index.xml` | absURL }} {{ else if eq . `bluesky` }} https://bsky.app/profile/{{ index site.Params.social . }} {{ else }} https://{{ . }}.com/{{ index site.Params.social . }} {{ end }}" alt="{{ . }}" rel="noopener noreferrer" target="_blank">
-                            <div class="btn btn-icon btn-landing border-0">
-                                {{ with resources.Get $path }}
-                                    {{ .Content | safeHTML }}
-                                {{ end }}
-                            </div>
-                        </a>
-                    </li>
-                {{ end }}
-            </ul>
-            {{ end }}
-            <!-- Social Links End -->
-
-            <div class="menu-extras ms-3 me-2">
-                <div class="menu-item">
-                    <!-- Mobile menu toggle-->
-                    <button class="navbar-toggle btn btn-icon btn-soft-light" id="isToggle" aria-label="toggleMenu" onclick="toggleMenu()">
-                        <div class="lines">
-                            <span></span>
-                            <span></span>
-                            <span></span>
-                        </div>
-                    </button>
-                    <!-- End mobile menu toggle-->
-                </div>
-            </div>
-
-        </div>
-    </div><!--end container-->
-</header><!--end header-->
-<!-- Navbar End -->
--- a/docs/layouts/partials/logo.html
+++ b/docs/layouts/partials/logo.html
@@ -1 +1 @@
-<a href="https://localai.io"><img src="https://raw.githubusercontent.com/mudler/LocalAI/refs/heads/master/core/http/static/logo.png"></a>
+<a href="https://localai.io"><img src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"></a>
--- a/docs/netlify.toml
+++ b/docs/netlify.toml
@@ -1,4 +1,4 @@
 [build]
 [build.environment]
-HUGO_VERSION = "0.146.3"
+HUGO_VERSION = "0.121.2"
 GO_VERSION = "1.22.2"
--- a/docs/static/android-chrome-192x192.png
+++ b/docs/static/android-chrome-192x192.png
--- a/docs/static/android-chrome-512x512.png
+++ b/docs/static/android-chrome-512x512.png
--- a/docs/static/apple-touch-icon.png
+++ b/docs/static/apple-touch-icon.png
--- a/docs/static/favicon-16x16.png
+++ b/docs/static/favicon-16x16.png
--- a/docs/static/favicon-32x32.png
+++ b/docs/static/favicon-32x32.png
--- a/docs/static/favicon.ico
+++ b/docs/static/favicon.ico
--- a/docs/static/favicon.svg
+++ b/docs/static/favicon.svg
--- a/docs/static/site.webmanifest
+++ b/docs/static/site.webmanifest
@@ -1 +0,0 @@
-{"name":"","short_name":"","icons":[{"src":"/android-chrome-192x192.png","sizes":"192x192","type":"image/png"},{"src":"/android-chrome-512x512.png","sizes":"512x512","type":"image/png"}],"theme_color":"#ffffff","background_color":"#ffffff","display":"standalone"}
--- a/docs/themes/lotusdocs
+++ b/docs/themes/lotusdocs
--- a/gallery/gemma.yaml
+++ b/gallery/gemma.yaml
@@ -7,33 +7,14 @@ config_file: |
  template:
    chat_message: |-
      <start_of_turn>{{if eq .RoleName "assistant" }}model{{else}}{{ .RoleName }}{{end}}
-      {{ if .FunctionCall -}}
-      {{ else if eq .RoleName "tool" -}}
-      {{ end -}}
      {{ if .Content -}}
      {{.Content -}}
-      {{ end -}}
-      {{ if .FunctionCall -}}
-      {{toJson .FunctionCall}}
      {{ end -}}<end_of_turn>
    chat: |
      {{.Input }}
      <start_of_turn>model
    completion: |
      {{.Input}}
-    function: |
-      <start_of_turn>system
-      You have access to functions. If you decide to invoke any of the function(s),
-      you MUST put it in the format of
-      {"name": function name, "parameters": dictionary of argument name and its value}
-
-      You SHOULD NOT include any other text in the response if you call a function
-      {{range .Functions}}
-      {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
-      {{end}}
-      <end_of_turn>
-      {{.Input -}}
-      <start_of_turn>model
  stopwords:
  - '<|im_end|>'
  - '<end_of_turn>'
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
--- a/main.go
+++ b/main.go
@@ -74,9 +74,10 @@ Version: ${version}
 		),
 		kong.UsageOnError(),
 		kong.Vars{
-			"basepath":  kong.ExpandPath("."),
-			"galleries": `[{"name":"localai", "url":"github:mudler/LocalAI/gallery/index.yaml@master"}]`,
-			"version":   internal.PrintableVersion(),
+			"basepath":         kong.ExpandPath("."),
+			"remoteLibraryURL": "https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/model_library.yaml",
+			"galleries":        `[{"name":"localai", "url":"github:mudler/LocalAI/gallery/index.yaml@master"}]`,
+			"version":          internal.PrintableVersion(),
 		},
 	)

--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@@ -473,6 +473,8 @@ func (ml *ModelLoader) backendLoader(opts ...Option) (client grpc.Backend, err e
 		backend = realBackend
 	}

+	ml.stopActiveBackends(o.modelID, o.singleActiveBackend)
+
 	var backendToConsume string

 	switch backend {
@@ -495,37 +497,17 @@ func (ml *ModelLoader) backendLoader(opts ...Option) (client grpc.Backend, err e
 }

 func (ml *ModelLoader) stopActiveBackends(modelID string, singleActiveBackend bool) {
-	if !singleActiveBackend {
-		return
-	}
-
 	// If we can have only one backend active, kill all the others (except external backends)
-
-	// Stop all backends except the one we are going to load
-	log.Debug().Msgf("Stopping all backends except '%s'", modelID)
-	err := ml.StopGRPC(allExcept(modelID))
-	if err != nil {
-		log.Error().Err(err).Str("keptModel", modelID).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing")
+	if singleActiveBackend {
+		log.Debug().Msgf("Stopping all backends except '%s'", modelID)
+		err := ml.StopGRPC(allExcept(modelID))
+		if err != nil {
+			log.Error().Err(err).Str("keptModel", modelID).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing")
+		}
 	}
 }

-func (ml *ModelLoader) Close() {
-	if !ml.singletonMode {
-		return
-	}
-	ml.singletonLock.Unlock()
-}
-
-func (ml *ModelLoader) lockBackend() {
-	if !ml.singletonMode {
-		return
-	}
-	ml.singletonLock.Lock()
-}
-
 func (ml *ModelLoader) Load(opts ...Option) (grpc.Backend, error) {
-	ml.lockBackend() // grab the singleton lock if needed
-
 	o := NewOptions(opts...)

 	// Return earlier if we have a model already loaded
@@ -536,20 +518,17 @@ func (ml *ModelLoader) Load(opts ...Option) (grpc.Backend, error) {
 		return m.GRPC(o.parallelRequests, ml.wd), nil
 	}

-	ml.stopActiveBackends(o.modelID, ml.singletonMode)
+	ml.stopActiveBackends(o.modelID, o.singleActiveBackend)

-	// if a backend is defined, return the loader directly
 	if o.backendString != "" {
 		return ml.backendLoader(opts...)
 	}

-	// Otherwise scan for backends in the asset directory
 	var err error

 	// get backends embedded in the binary
 	autoLoadBackends, err := ml.ListAvailableBackends(o.assetDir)
 	if err != nil {
-		ml.Close() // we failed, release the lock
 		return nil, err
 	}

@@ -581,7 +560,5 @@ func (ml *ModelLoader) Load(opts ...Option) (grpc.Backend, error) {
 		}
 	}

-	ml.Close() // make sure to release the lock in case of failure
-
 	return nil, fmt.Errorf("could not load model - all backends returned error: %s", err.Error())
 }
--- a/pkg/model/loader.go
+++ b/pkg/model/loader.go
@@ -18,19 +18,16 @@ import (

 // TODO: Split ModelLoader and TemplateLoader? Just to keep things more organized. Left together to share a mutex until I look into that. Would split if we seperate directories for .bin/.yaml and .tmpl
 type ModelLoader struct {
-	ModelPath     string
-	mu            sync.Mutex
-	singletonLock sync.Mutex
-	singletonMode bool
-	models        map[string]*Model
-	wd            *WatchDog
+	ModelPath string
+	mu        sync.Mutex
+	models    map[string]*Model
+	wd        *WatchDog
 }

-func NewModelLoader(modelPath string, singleActiveBackend bool) *ModelLoader {
+func NewModelLoader(modelPath string) *ModelLoader {
 	nml := &ModelLoader{
-		ModelPath:     modelPath,
-		models:        make(map[string]*Model),
-		singletonMode: singleActiveBackend,
+		ModelPath: modelPath,
+		models:    make(map[string]*Model),
 	}

 	return nml
@@ -145,6 +142,36 @@ func (ml *ModelLoader) LoadModel(modelID, modelName string, loader func(string,
 func (ml *ModelLoader) ShutdownModel(modelName string) error {
 	ml.mu.Lock()
 	defer ml.mu.Unlock()
+	// Look the model up first so an unknown name is reported as an error
+	// before deleteProcess is reached.
+	model, ok := ml.models[modelName]
+	if !ok {
+		return fmt.Errorf("model %s not found", modelName)
+	}
+
+	// The override cannot usefully change between iterations, so read the
+	// environment once instead of on every retry.
+	forceShutdown := os.Getenv("LOCALAI_FORCE_BACKEND_SHUTDOWN") == "true"
+
+	// Wait for in-flight work to drain before deleting the process. The delay
+	// grows by two seconds per retry and is capped at retryTimeout.
+	// NOTE(review): ml.mu stays held for the whole wait, and without the
+	// override the loop only ends once the backend stops reporting busy.
+	retries := 1
+	for model.GRPC(false, ml.wd).IsBusy() {
+		log.Debug().Msgf("%s busy. Waiting.", modelName)
+		dur := time.Duration(retries*2) * time.Second
+		if dur > retryTimeout {
+			dur = retryTimeout
+		}
+		time.Sleep(dur)
+		retries++
+
+		if retries > 10 && forceShutdown {
+			log.Warn().Msgf("Model %s is still busy after %d retries. Forcing shutdown.", modelName, retries)
+			break
+		}
+	}
 
 	return ml.deleteProcess(modelName)
 }
--- a/pkg/model/loader_options.go
+++ b/pkg/model/loader_options.go
@@ -17,9 +17,10 @@ type Options struct {

 	externalBackends map[string]string

-	grpcAttempts      int
-	grpcAttemptsDelay int
-	parallelRequests  bool
+	grpcAttempts        int
+	grpcAttemptsDelay   int
+	singleActiveBackend bool
+	parallelRequests    bool
 }

 type Option func(*Options)
@@ -87,6 +88,12 @@ func WithContext(ctx context.Context) Option {
 	}
 }

+func WithSingleActiveBackend() Option {
+	return func(o *Options) {
+		o.singleActiveBackend = true
+	}
+}
+
 func WithModelID(id string) Option {
 	return func(o *Options) {
 		o.modelID = id
--- a/pkg/model/loader_test.go
+++ b/pkg/model/loader_test.go
@@ -21,7 +21,7 @@ var _ = Describe("ModelLoader", func() {
 		// Setup the model loader with a test directory
 		modelPath = "/tmp/test_model_path"
 		os.Mkdir(modelPath, 0755)
-		modelLoader = model.NewModelLoader(modelPath, false)
+		modelLoader = model.NewModelLoader(modelPath)
 	})

 	AfterEach(func() {
--- a/pkg/model/process.go
+++ b/pkg/model/process.go
@@ -9,43 +9,25 @@ import (
 	"strconv"
 	"strings"
 	"syscall"
-	"time"

 	"github.com/hpcloud/tail"
 	process "github.com/mudler/go-processmanager"
 	"github.com/rs/zerolog/log"
 )

-var forceBackendShutdown bool = os.Getenv("LOCALAI_FORCE_BACKEND_SHUTDOWN") == "true"
-
 func (ml *ModelLoader) deleteProcess(s string) error {
-	model, ok := ml.models[s]
-	if !ok {
-		log.Debug().Msgf("Model %s not found", s)
-		return fmt.Errorf("model %s not found", s)
-	}
-
 	defer delete(ml.models, s)

-	retries := 1
-	for model.GRPC(false, ml.wd).IsBusy() {
-		log.Debug().Msgf("%s busy. Waiting.", s)
-		dur := time.Duration(retries*2) * time.Second
-		if dur > retryTimeout {
-			dur = retryTimeout
-		}
-		time.Sleep(dur)
-		retries++
-
-		if retries > 10 && forceBackendShutdown {
-			log.Warn().Msgf("Model %s is still busy after %d retries. Forcing shutdown.", s, retries)
-			break
-		}
-	}
-
 	log.Debug().Msgf("Deleting process %s", s)

-	process := model.Process()
+	m, exists := ml.models[s]
+	if !exists {
+		log.Error().Msgf("Model does not exist %s", s)
+		// Nothing to do
+		return nil
+	}
+
+	process := m.Process()
 	if process == nil {
 		log.Error().Msgf("No process for %s", s)
 		// Nothing to do as there is no process
@@ -62,12 +44,22 @@ func (ml *ModelLoader) deleteProcess(s string) error {
 
 func (ml *ModelLoader) StopGRPC(filter GRPCProcessFilter) error {
 	var err error = nil
+
+	// Collect the matching model IDs under ml.mu; the map must not be
+	// ranged over unlocked because ShutdownModel deletes entries from it.
 	ml.mu.Lock()
-	defer ml.mu.Unlock()
-
+	toStop := make([]string, 0, len(ml.models))
 	for k, m := range ml.models {
 		if filter(k, m.Process()) {
-			e := ml.deleteProcess(k)
+			toStop = append(toStop, k)
+		}
+	}
+	ml.mu.Unlock()
+
+	// ShutdownModel takes ml.mu itself, so it is called only after the
+	// lock has been released.
+	for _, k := range toStop {
+		if e := ml.ShutdownModel(k); e != nil {
 			err = errors.Join(err, e)
 		}
 	}
--- a/tests/integration/stores_test.go
+++ b/tests/integration/stores_test.go
@@ -70,7 +70,7 @@ var _ = Describe("Integration tests for the stores backend(s) and internal APIs"
 				model.WithModel("test"),
 			}

-			sl = model.NewModelLoader("", false)
+			sl = model.NewModelLoader("")
 			sc, err = sl.Load(storeOpts...)
 			Expect(err).ToNot(HaveOccurred())
 			Expect(sc).ToNot(BeNil())
@@ -235,7 +235,7 @@ var _ = Describe("Integration tests for the stores backend(s) and internal APIs"
 			keys := [][]float32{{1.0, 0.0, 0.0}, {0.0, 1.0, 0.0}, {0.0, 0.0, 1.0}, {-1.0, 0.0, 0.0}}
 			vals := [][]byte{[]byte("x"), []byte("y"), []byte("z"), []byte("-z")}

-			err := store.SetCols(context.Background(), sc, keys, vals)
+			err := store.SetCols(context.Background(), sc, keys, vals);
 			Expect(err).ToNot(HaveOccurred())

 			_, _, sims, err := store.Find(context.Background(), sc, keys[0], 4)
@@ -247,7 +247,7 @@ var _ = Describe("Integration tests for the stores backend(s) and internal APIs"
 			keys := [][]float32{{1.0, 0.0, 1.0}, {0.0, 2.0, 0.0}, {0.0, 0.0, -1.0}, {-1.0, 0.0, -1.0}}
 			vals := [][]byte{[]byte("x"), []byte("y"), []byte("z"), []byte("-z")}

-			err := store.SetCols(context.Background(), sc, keys, vals)
+			err := store.SetCols(context.Background(), sc, keys, vals);
 			Expect(err).ToNot(HaveOccurred())

 			_, _, sims, err := store.Find(context.Background(), sc, keys[0], 4)
@@ -314,7 +314,7 @@ var _ = Describe("Integration tests for the stores backend(s) and internal APIs"

 			normalize(keys[6:])

-			err := store.SetCols(context.Background(), sc, keys, vals)
+			err := store.SetCols(context.Background(), sc, keys, vals);
 			Expect(err).ToNot(HaveOccurred())

 			expectTriangleEq(keys, vals)
@@ -341,7 +341,7 @@ var _ = Describe("Integration tests for the stores backend(s) and internal APIs"
 				c += 1
 			}

-			err := store.SetCols(context.Background(), sc, keys, vals)
+			err := store.SetCols(context.Background(), sc, keys, vals);
 			Expect(err).ToNot(HaveOccurred())

 			expectTriangleEq(keys, vals)
				`@@ -1 +0,0 @@`
				`{"name":"","short_name":"","icons":[{"src":"/android-chrome-192x192.png","sizes":"192x192","type":"image/png"},{"src":"/android-chrome-512x512.png","sizes":"512x512","type":"image/png"}],"theme_color":"#ffffff","background_color":"#ffffff","display":"standalone"}`