Mirror of https://github.com/mudler/LocalAI.git (synced 2026-02-03 03:02:38 -05:00)

Compare commits: v2.26.0...feat/llama (312 commits)
.env — 3 changes

@@ -29,6 +29,9 @@
## Enable/Disable single backend (useful if only one GPU is available)
# LOCALAI_SINGLE_ACTIVE_BACKEND=true

# Forces shutdown of the backends if busy (only if LOCALAI_SINGLE_ACTIVE_BACKEND is set)
# LOCALAI_FORCE_BACKEND_SHUTDOWN=true

## Specify a build type. Available: cublas, openblas, clblas.
## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.
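The hunk above documents the single-backend switches and the BLAS build flavours. As a rough illustration only (not part of the diff), a .env tuned for a single GPU-backed instance could look like the sketch below; the BUILD_TYPE variable name is an assumption inferred from the "Specify a build type" comment.

```bash
# Hypothetical .env excerpt (sketch, not taken from the repository).
# Only the two LOCALAI_* switches appear verbatim in the hunk above;
# BUILD_TYPE is an assumption based on the build-type comment.
LOCALAI_SINGLE_ACTIVE_BACKEND=true   # keep a single backend loaded at a time
LOCALAI_FORCE_BACKEND_SHUTDOWN=true  # evict a busy backend when a new one is requested
BUILD_TYPE=cublas                    # alternatives listed above: openblas, clblas
```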
.github/dependabot.yml — 4 changes

@@ -29,10 +29,6 @@ updates:
schedule:
# Check for updates to GitHub Actions every weekday
interval: "weekly"
- package-ecosystem: "pip"
directory: "/backend/python/autogptq"
schedule:
interval: "weekly"
- package-ecosystem: "pip"
directory: "/backend/python/bark"
schedule:
.github/workflows/bump_deps.yaml — 2 changes

@@ -9,7 +9,7 @@ jobs:
fail-fast: false
matrix:
include:
- repository: "ggerganov/llama.cpp"
- repository: "ggml-org/llama.cpp"
variable: "CPPLLAMA_VERSION"
branch: "master"
- repository: "ggerganov/whisper.cpp"
.github/workflows/deploy-explorer.yaml — 4 changes

@@ -33,7 +33,7 @@ jobs:
run: |
CGO_ENABLED=0 make build-api
- name: rm
uses: appleboy/ssh-action@v1.2.0
uses: appleboy/ssh-action@v1.2.2
with:
host: ${{ secrets.EXPLORER_SSH_HOST }}
username: ${{ secrets.EXPLORER_SSH_USERNAME }}
@@ -53,7 +53,7 @@ jobs:
rm: true
target: ./local-ai
- name: restarting
uses: appleboy/ssh-action@v1.2.0
uses: appleboy/ssh-action@v1.2.2
with:
host: ${{ secrets.EXPLORER_SSH_HOST }}
username: ${{ secrets.EXPLORER_SSH_USERNAME }}
.github/workflows/generate_intel_image.yaml — 2 changes

@@ -15,7 +15,7 @@ jobs:
strategy:
matrix:
include:
- base-image: intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04
- base-image: intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04
runs-on: 'ubuntu-latest'
platforms: 'linux/amd64'
runs-on: ${{matrix.runs-on}}
.github/workflows/image.yml — 7 changes

@@ -75,6 +75,7 @@ jobs:
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
latest-image: 'latest-gpu-hipblas-core'
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
@@ -251,6 +252,7 @@ jobs:
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
latest-image: 'latest-gpu-intel-f16-core'
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'false'
@@ -261,6 +263,7 @@ jobs:
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
latest-image: 'latest-gpu-intel-f32-core'

core-image-build:
uses: ./.github/workflows/image_build.yml
@@ -339,6 +342,7 @@ jobs:
base-image: "ubuntu:22.04"
makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'false'
latest-image: 'latest-gpu-nvidia-cuda-12-core'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
@@ -351,17 +355,18 @@ jobs:
base-image: "ubuntu:22.04"
skip-drivers: 'false'
makeflags: "--jobs=4 --output-sync=target"
latest-image: 'latest-gpu-nvidia-cuda-12-core'
- build-type: 'vulkan'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-vulkan-ffmpeg-core'
latest-image: 'latest-vulkan-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
makeflags: "--jobs=4 --output-sync=target"
latest-image: 'latest-gpu-vulkan-core'
gh-runner:
uses: ./.github/workflows/image_build.yml
with:
.github/workflows/image_build.yml — 5 changes

@@ -310,6 +310,11 @@ jobs:
tags: ${{ steps.meta_aio_dockerhub.outputs.tags }}
labels: ${{ steps.meta_aio_dockerhub.outputs.labels }}

- name: Cleanup
run: |
docker builder prune -f
docker system prune --force --volumes --all

- name: Latest tag
# run this on branches, when it is a tag and there is a latest-image defined
if: github.event_name != 'pull_request' && inputs.latest-image != '' && github.ref_type == 'tag'
.github/workflows/notify-models.yaml — 6 changes

@@ -8,7 +8,7 @@ jobs:
notify-discord:
if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
env:
MODEL_NAME: hermes-2-theta-llama-3-8b
MODEL_NAME: gemma-3-12b-it
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
@@ -16,7 +16,7 @@ jobs:
fetch-depth: 0 # needed to checkout all branches for this Action to work
- uses: mudler/localai-github-action@v1
with:
model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
# Check the PR diff using the current branch and the base branch of the PR
- uses: GrantBirki/git-diff-action@v2.8.0
id: git-diff-action
@@ -87,7 +87,7 @@ jobs:
notify-twitter:
if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
env:
MODEL_NAME: hermes-2-theta-llama-3-8b
MODEL_NAME: gemma-3-12b-it
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
.github/workflows/notify-releases.yaml — 4 changes

@@ -14,7 +14,7 @@ jobs:
steps:
- uses: mudler/localai-github-action@v1
with:
model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
- name: Summarize
id: summarize
run: |
@@ -60,4 +60,4 @@ jobs:
DISCORD_AVATAR: "https://avatars.githubusercontent.com/u/139863280?v=4"
uses: Ilshidur/action-discord@master
with:
args: ${{ steps.summarize.outputs.message }}
args: ${{ steps.summarize.outputs.message }}
.github/workflows/secscan.yaml — 2 changes

@@ -18,7 +18,7 @@ jobs:
if: ${{ github.actor != 'dependabot[bot]' }}
- name: Run Gosec Security Scanner
if: ${{ github.actor != 'dependabot[bot]' }}
uses: securego/gosec@v2.22.0
uses: securego/gosec@v2.22.3
with:
# we let the report trigger content trigger a failure using the GitHub Security features.
args: '-no-fail -fmt sarif -out results.sarif ./...'
Dockerfile

@@ -15,7 +15,7 @@ ARG TARGETARCH
ARG TARGETVARIANT

ENV DEBIAN_FRONTEND=noninteractive
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"

RUN apt-get update && \
apt-get install -y --no-install-recommends \
@@ -24,6 +24,7 @@ RUN apt-get update && \
ca-certificates \
curl libssl-dev \
git \
git-lfs \
unzip upx-ucl && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
@@ -430,9 +431,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMA
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/vllm \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "autogptq" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/autogptq \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "bark" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/bark \
; fi && \
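The EXTERNAL_GRPC_BACKENDS change above drops the autogptq entry from the default backend list baked into the image. A deployment that needs a different backend set can still override the variable at container start; the sketch below is illustrative only, and the "name:/path/run.sh" entries are copied from the ENV line shown above.

```bash
# Illustrative runtime override of the default backend list (sketch).
docker run -ti --name local-ai -p 8080:8080 \
  -e EXTERNAL_GRPC_BACKENDS="bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh" \
  localai/localai:latest
```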
Makefile — 30 changes

@@ -6,7 +6,7 @@ BINARY_NAME=local-ai
DETECT_LIBS?=true

# llama.cpp versions
CPPLLAMA_VERSION?=300907b2110cc17b4337334dc397e05de2d8f5e0
CPPLLAMA_VERSION?=6408210082cc0a61b992b487be7e2ff2efbb9e36

# whisper.cpp version
WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
@@ -21,8 +21,8 @@ BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git
BARKCPP_VERSION?=v1.0.0

# stablediffusion.cpp (ggml)
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
STABLEDIFFUSION_GGML_VERSION?=d46ed5e184b97c2018dc2e8105925bdb8775e02c
STABLEDIFFUSION_GGML_REPO?=https://github.com/richiejp/stable-diffusion.cpp
STABLEDIFFUSION_GGML_VERSION?=53e3b17eb3d0b5760ced06a1f98320b68b34aaae

ONNX_VERSION?=1.20.0
ONNX_ARCH?=x64
@@ -260,11 +260,7 @@ backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp
$(MAKE) -C backend/go/image/stablediffusion-ggml libsd.a

backend-assets/grpc/stablediffusion-ggml: backend/go/image/stablediffusion-ggml/libsd.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ LIBRARY_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion-ggml ./backend/go/image/stablediffusion-ggml/
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/stablediffusion-ggml
endif
$(MAKE) -C backend/go/image/stablediffusion-ggml CGO_LDFLAGS="$(CGO_LDFLAGS)" stablediffusion-ggml

sources/onnxruntime:
mkdir -p sources/onnxruntime
@@ -509,18 +505,10 @@ protogen-go-clean:
$(RM) bin/*

.PHONY: protogen-python
protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
protogen-python: bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen

.PHONY: protogen-python-clean
protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean

.PHONY: autogptq-protogen
autogptq-protogen:
$(MAKE) -C backend/python/autogptq protogen

.PHONY: autogptq-protogen-clean
autogptq-protogen-clean:
$(MAKE) -C backend/python/autogptq protogen-clean
protogen-python-clean: bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean

.PHONY: bark-protogen
bark-protogen:
@@ -597,7 +585,6 @@ vllm-protogen-clean:
## GRPC
# Note: it is duplicated in the Dockerfile
prepare-extra-conda-environments: protogen-python
$(MAKE) -C backend/python/autogptq
$(MAKE) -C backend/python/bark
$(MAKE) -C backend/python/coqui
$(MAKE) -C backend/python/diffusers
@@ -809,7 +796,8 @@ docker-aio-all:

docker-image-intel:
docker build \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
--progress plain \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu24.04 \
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
--build-arg GO_TAGS="none" \
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@@ -817,7 +805,7 @@ docker-image-intel:

docker-image-intel-xpu:
docker build \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04 \
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
--build-arg GO_TAGS="none" \
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
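The docker-image-intel and docker-image-intel-xpu targets above now default to the 2025.1.0 oneAPI base images. Invoking them follows the usual Makefile pattern; the IMAGE_TYPE value in the sketch below is an assumption for illustration, not taken from the diff.

```bash
# Build the Intel images with the updated oneAPI base images (sketch).
make docker-image-intel IMAGE_TYPE=core
make docker-image-intel-xpu IMAGE_TYPE=core
```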
README.md — 98 changes

@@ -1,7 +1,6 @@
<h1 align="center">
<br>
<img height="300" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"> <br>
LocalAI
<img height="300" src="./core/http/static/logo.png"> <br>
<br>
</h1>

@@ -48,9 +47,58 @@

[](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[](https://artifacthub.io/packages/search?repo=localai)

**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that’s compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that's compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).

## 📚🆕 Local Stack Family

🆕 LocalAI is now part of a comprehensive suite of AI tools designed to work together:

<table>
<tr>
<td width="50%" valign="top">
<a href="https://github.com/mudler/LocalAGI">
<img src="https://raw.githubusercontent.com/mudler/LocalAGI/refs/heads/main/webui/react-ui/public/logo_2.png" width="300" alt="LocalAGI Logo">
</a>
</td>
<td width="50%" valign="top">
<h3><a href="https://github.com/mudler/LocalAGI">LocalAGI</a></h3>
<p>A powerful Local AI agent management platform that serves as a drop-in replacement for OpenAI's Responses API, enhanced with advanced agentic capabilities.</p>
</td>
</tr>
<tr>
<td width="50%" valign="top">
<a href="https://github.com/mudler/LocalRecall">
<img src="https://raw.githubusercontent.com/mudler/LocalRecall/refs/heads/main/static/localrecall_horizontal.png" width="300" alt="LocalRecall Logo">
</a>
</td>
<td width="50%" valign="top">
<h3><a href="https://github.com/mudler/LocalRecall">LocalRecall</a></h3>
<p>A REST-ful API and knowledge base management system that provides persistent memory and storage capabilities for AI agents.</p>
</td>
</tr>
</table>

## Screenshots

| Talk Interface | Generate Audio |
| --- | --- |
|  |  |

| Models Overview | Generate Images |
| --- | --- |
|  |  |

| Chat Interface | Home |
| --- | --- |
|  |  |

| Login | Swarm |
| --- | --- |
| |  |

## 💻 Quickstart

Run the installer script:

@@ -59,17 +107,21 @@ curl https://localai.io/install.sh | sh
```

Or run with docker:

### CPU only image:
```bash
# CPU only image:
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-cpu

# Nvidia GPU:
```
### Nvidia GPU:
```bash
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12

# CPU and GPU image (bigger size):
```
### CPU and GPU image (bigger size):
```bash
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest

# AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)
```
### AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)
```bash
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
```

@@ -88,10 +140,13 @@ local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
local-ai run oci://localai/phi-2:latest
```

[💻 Getting started](https://localai.io/basics/getting_started/index.html)
For more information, see [💻 Getting started](https://localai.io/basics/getting_started/index.html)

## 📰 Latest project news

- Apr 2025: [LocalAGI](https://github.com/mudler/LocalAGI) and [LocalRecall](https://github.com/mudler/LocalRecall) join the LocalAI family stack.
- Apr 2025: WebUI overhaul, AIO images updates
- Feb 2025: Backend cleanup, Breaking changes, new backends (kokoro, OutelTTS, faster-whisper), Nvidia L4T images
- Jan 2025: LocalAI model release: https://huggingface.co/mudler/LocalAI-functioncall-phi-4-v0.3, SANA support in diffusers: https://github.com/mudler/LocalAI/pull/4603
- Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 )
- Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 )
@@ -105,19 +160,6 @@ local-ai run oci://localai/phi-2:latest

Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

## 🔥🔥 Hot topics (looking for help):

- Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
- Realtime API https://github.com/mudler/LocalAI/issues/3714
- WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
- Backends v2: https://github.com/mudler/LocalAI/issues/1126
- Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
- Assistant API: https://github.com/mudler/LocalAI/issues/1273
- Vulkan: https://github.com/mudler/LocalAI/issues/1647
- Anthropic API: https://github.com/mudler/LocalAI/issues/1808

If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22

## 🚀 [Features](https://localai.io/features/)

- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `transformers`, `vllm` ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
@@ -131,12 +173,10 @@ If you want to help and contribute, issues up for grabs: https://github.com/mudl
- 🥽 [Vision API](https://localai.io/features/gpt-vision/)
- 📈 [Reranker API](https://localai.io/features/reranker/)
- 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
- [Agentic capabilities](https://github.com/mudler/LocalAGI)
- 🔊 Voice activity detection (Silero-VAD support)
- 🌍 Integrated WebUI!

## 💻 Usage

Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.

### 🔗 Community and integrations

@@ -212,7 +252,7 @@ A huge thank you to our generous sponsors who support this project covering CI e

<p align="center">
<a href="https://www.spectrocloud.com/" target="blank">
<img height="200" src="https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512">
<img height="200" src="https://github.com/user-attachments/assets/72eab1dd-8b93-4fc0-9ade-84db49f24962">
</a>
<a href="https://www.premai.io/" target="blank">
<img height="200" src="https://github.com/mudler/LocalAI/assets/2420543/42e4ca83-661e-4f79-8e46-ae43689683d6"> <br>
@@ -1,7 +1,7 @@
name: text-embedding-ada-002
embeddings: true
name: text-embedding-ada-002
parameters:
model: huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/llama-3.2-1b-instruct-q4_k_m.gguf
model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf

usage: |
You can test this model with curl like this:
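For reference, an embeddings request against LocalAI's OpenAI-compatible endpoint for the model name configured above typically looks like the sketch below; the endpoint path and payload shape follow the standard OpenAI embeddings API and are not quoted from this file.

```bash
# Sketch of an embeddings request once the model above is configured.
curl http://localhost:8080/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{
        "model": "text-embedding-ada-002",
        "input": "A sample sentence to embed"
      }'
```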
@@ -1,101 +1,57 @@
|
||||
name: gpt-4
|
||||
mmap: true
|
||||
parameters:
|
||||
model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
|
||||
context_size: 8192
|
||||
|
||||
stopwords:
|
||||
- "<|im_end|>"
|
||||
- "<dummy32000>"
|
||||
- "</tool_call>"
|
||||
- "<|eot_id|>"
|
||||
- "<|end_of_text|>"
|
||||
|
||||
f16: true
|
||||
function:
|
||||
# disable injecting the "answer" tool
|
||||
disable_no_action: true
|
||||
|
||||
grammar:
|
||||
# This allows the grammar to also return messages
|
||||
mixed_mode: true
|
||||
# Suffix to add to the grammar
|
||||
#prefix: '<tool_call>\n'
|
||||
# Force parallel calls in the grammar
|
||||
# parallel_calls: true
|
||||
|
||||
return_name_in_function_response: true
|
||||
# Without grammar uncomment the lines below
|
||||
# Warning: this is relying only on the capability of the
|
||||
# LLM model to generate the correct function call.
|
||||
json_regex_match:
|
||||
- "(?s)<tool_call>(.*?)</tool_call>"
|
||||
- "(?s)<tool_call>(.*?)"
|
||||
replace_llm_results:
|
||||
# Drop the scratchpad content from responses
|
||||
- key: "(?s)<scratchpad>.*</scratchpad>"
|
||||
value: ""
|
||||
replace_function_results:
|
||||
# Replace everything that is not JSON array or object
|
||||
#
|
||||
- key: '(?s)^[^{\[]*'
|
||||
value: ""
|
||||
- key: '(?s)[^}\]]*$'
|
||||
value: ""
|
||||
- key: "'([^']*?)'"
|
||||
value: "_DQUOTE_${1}_DQUOTE_"
|
||||
- key: '\\"'
|
||||
value: "__TEMP_QUOTE__"
|
||||
- key: "\'"
|
||||
value: "'"
|
||||
- key: "_DQUOTE_"
|
||||
value: '"'
|
||||
- key: "__TEMP_QUOTE__"
|
||||
value: '"'
|
||||
# Drop the scratchpad content from responses
|
||||
- key: "(?s)<scratchpad>.*</scratchpad>"
|
||||
value: ""
|
||||
|
||||
no_mixed_free_string: true
|
||||
schema_type: llama3.1 # or JSON is supported too (json)
|
||||
response_regex:
|
||||
- <function=(?P<name>\w+)>(?P<arguments>.*)</function>
|
||||
mmap: true
|
||||
name: gpt-4
|
||||
parameters:
|
||||
model: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
|
||||
stopwords:
|
||||
- <|im_end|>
|
||||
- <dummy32000>
|
||||
- <|eot_id|>
|
||||
- <|end_of_text|>
|
||||
template:
|
||||
chat: |
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
|
||||
You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>
|
||||
{{.Input }}
|
||||
<|start_header_id|>assistant<|end_header_id|>
|
||||
chat_message: |
|
||||
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
|
||||
{{- if .FunctionCall }}
|
||||
<tool_call>
|
||||
{{- else if eq .RoleName "tool" }}
|
||||
<tool_response>
|
||||
{{- end }}
|
||||
{{- if .Content}}
|
||||
{{.Content }}
|
||||
{{- end }}
|
||||
{{- if .FunctionCall}}
|
||||
{{toJson .FunctionCall}}
|
||||
{{- end }}
|
||||
{{- if .FunctionCall }}
|
||||
</tool_call>
|
||||
{{- else if eq .RoleName "tool" }}
|
||||
</tool_response>
|
||||
{{- end }}<|im_end|>
|
||||
<|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|>
|
||||
{{ if .FunctionCall -}}
|
||||
{{ else if eq .RoleName "tool" -}}
|
||||
The Function was executed and the response was:
|
||||
{{ end -}}
|
||||
{{ if .Content -}}
|
||||
{{.Content -}}
|
||||
{{ else if .FunctionCall -}}
|
||||
{{ range .FunctionCall }}
|
||||
[{{.FunctionCall.Name}}({{.FunctionCall.Arguments}})]
|
||||
{{ end }}
|
||||
{{ end -}}
|
||||
<|eot_id|>
|
||||
completion: |
|
||||
{{.Input}}
|
||||
function: |-
|
||||
<|im_start|>system
|
||||
You are a function calling AI model.
|
||||
Here are the available tools:
|
||||
<tools>
|
||||
{{range .Functions}}
|
||||
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
|
||||
{{end}}
|
||||
</tools>
|
||||
You should call the tools provided to you sequentially
|
||||
Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
|
||||
<scratchpad>
|
||||
{step-by-step reasoning and plan in bullet points}
|
||||
</scratchpad>
|
||||
For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
|
||||
<tool_call>
|
||||
{"arguments": <args-dict>, "name": <function-name>}
|
||||
</tool_call><|im_end|>
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
function: |
|
||||
<|start_header_id|>system<|end_header_id|>
|
||||
You are an expert in composing functions. You are given a question and a set of possible functions.
|
||||
Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
|
||||
If none of the functions can be used, point it out. If the given question lacks the parameters required by the function, also point it out. You should only return the function call in tools call sections.
|
||||
If you decide to invoke any of the function(s), you MUST put it in the format as follows:
|
||||
[func_name1(params_name1=params_value1,params_name2=params_value2,...),func_name2(params_name1=params_value1,params_name2=params_value2,...)]
|
||||
You SHOULD NOT include any other text in the response.
|
||||
Here is a list of functions in JSON format that you can invoke.
|
||||
{{toJson .Functions}}
|
||||
<|eot_id|><|start_header_id|>user<|end_header_id|>
|
||||
{{.Input}}
|
||||
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
|
||||
|
||||
download_files:
|
||||
- filename: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
|
||||
sha256: 2e220a14ba4328fee38cf36c2c068261560f999fadb5725ce5c6d977cb5126b5
|
||||
uri: huggingface://bartowski/Hermes-3-Llama-3.2-3B-GGUF/Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
|
||||
@@ -1,31 +1,49 @@
|
||||
backend: llama-cpp
|
||||
context_size: 4096
|
||||
f16: true
|
||||
mmap: true
|
||||
mmproj: minicpm-v-2_6-mmproj-f16.gguf
|
||||
name: gpt-4o
|
||||
|
||||
roles:
|
||||
user: "USER:"
|
||||
assistant: "ASSISTANT:"
|
||||
system: "SYSTEM:"
|
||||
|
||||
mmproj: bakllava-mmproj.gguf
|
||||
parameters:
|
||||
model: bakllava.gguf
|
||||
|
||||
model: minicpm-v-2_6-Q4_K_M.gguf
|
||||
stopwords:
|
||||
- <|im_end|>
|
||||
- <dummy32000>
|
||||
- </s>
|
||||
- <|endoftext|>
|
||||
template:
|
||||
chat: |
|
||||
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
chat_message: |
|
||||
<|im_start|>{{ .RoleName }}
|
||||
{{ if .FunctionCall -}}
|
||||
Function call:
|
||||
{{ else if eq .RoleName "tool" -}}
|
||||
Function response:
|
||||
{{ end -}}
|
||||
{{ if .Content -}}
|
||||
{{.Content }}
|
||||
{{ end -}}
|
||||
{{ if .FunctionCall -}}
|
||||
{{toJson .FunctionCall}}
|
||||
{{ end -}}<|im_end|>
|
||||
completion: |
|
||||
{{.Input}}
|
||||
ASSISTANT:
|
||||
function: |
|
||||
<|im_start|>system
|
||||
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
|
||||
{{range .Functions}}
|
||||
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
|
||||
{{end}}
|
||||
For each function call return a json object with function name and arguments
|
||||
<|im_end|>
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
|
||||
download_files:
|
||||
- filename: bakllava.gguf
|
||||
uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
|
||||
- filename: bakllava-mmproj.gguf
|
||||
uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
|
||||
|
||||
usage: |
|
||||
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
|
||||
"model": "gpt-4-vision-preview",
|
||||
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
|
||||
- filename: minicpm-v-2_6-Q4_K_M.gguf
|
||||
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
|
||||
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
|
||||
- filename: minicpm-v-2_6-mmproj-f16.gguf
|
||||
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
|
||||
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
|
||||
@@ -1,7 +1,7 @@
embeddings: true
name: text-embedding-ada-002
backend: sentencetransformers
parameters:
model: all-MiniLM-L6-v2
model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf

usage: |
You can test this model with curl like this:
@@ -1,101 +1,53 @@
|
||||
name: gpt-4
|
||||
mmap: true
|
||||
parameters:
|
||||
model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
|
||||
context_size: 8192
|
||||
|
||||
stopwords:
|
||||
- "<|im_end|>"
|
||||
- "<dummy32000>"
|
||||
- "</tool_call>"
|
||||
- "<|eot_id|>"
|
||||
- "<|end_of_text|>"
|
||||
|
||||
context_size: 4096
|
||||
f16: true
|
||||
function:
|
||||
# disable injecting the "answer" tool
|
||||
disable_no_action: true
|
||||
|
||||
capture_llm_results:
|
||||
- (?s)<Thought>(.*?)</Thought>
|
||||
grammar:
|
||||
# This allows the grammar to also return messages
|
||||
mixed_mode: true
|
||||
# Suffix to add to the grammar
|
||||
#prefix: '<tool_call>\n'
|
||||
# Force parallel calls in the grammar
|
||||
# parallel_calls: true
|
||||
|
||||
return_name_in_function_response: true
|
||||
# Without grammar uncomment the lines below
|
||||
# Warning: this is relying only on the capability of the
|
||||
# LLM model to generate the correct function call.
|
||||
json_regex_match:
|
||||
- "(?s)<tool_call>(.*?)</tool_call>"
|
||||
- "(?s)<tool_call>(.*?)"
|
||||
properties_order: name,arguments
|
||||
json_regex_match:
|
||||
- (?s)<Output>(.*?)</Output>
|
||||
replace_llm_results:
|
||||
# Drop the scratchpad content from responses
|
||||
- key: "(?s)<scratchpad>.*</scratchpad>"
|
||||
- key: (?s)<Thought>(.*?)</Thought>
|
||||
value: ""
|
||||
replace_function_results:
|
||||
# Replace everything that is not JSON array or object
|
||||
#
|
||||
- key: '(?s)^[^{\[]*'
|
||||
value: ""
|
||||
- key: '(?s)[^}\]]*$'
|
||||
value: ""
|
||||
- key: "'([^']*?)'"
|
||||
value: "_DQUOTE_${1}_DQUOTE_"
|
||||
- key: '\\"'
|
||||
value: "__TEMP_QUOTE__"
|
||||
- key: "\'"
|
||||
value: "'"
|
||||
- key: "_DQUOTE_"
|
||||
value: '"'
|
||||
- key: "__TEMP_QUOTE__"
|
||||
value: '"'
|
||||
# Drop the scratchpad content from responses
|
||||
- key: "(?s)<scratchpad>.*</scratchpad>"
|
||||
value: ""
|
||||
|
||||
mmap: true
|
||||
name: gpt-4
|
||||
parameters:
|
||||
model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
|
||||
stopwords:
|
||||
- <|im_end|>
|
||||
- <dummy32000>
|
||||
- </s>
|
||||
template:
|
||||
chat: |
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
chat_message: |
|
||||
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
|
||||
{{- if .FunctionCall }}
|
||||
<tool_call>
|
||||
{{- else if eq .RoleName "tool" }}
|
||||
<tool_response>
|
||||
{{- end }}
|
||||
{{- if .Content}}
|
||||
<|im_start|>{{ .RoleName }}
|
||||
{{ if .FunctionCall -}}
|
||||
Function call:
|
||||
{{ else if eq .RoleName "tool" -}}
|
||||
Function response:
|
||||
{{ end -}}
|
||||
{{ if .Content -}}
|
||||
{{.Content }}
|
||||
{{- end }}
|
||||
{{- if .FunctionCall}}
|
||||
{{ end -}}
|
||||
{{ if .FunctionCall -}}
|
||||
{{toJson .FunctionCall}}
|
||||
{{- end }}
|
||||
{{- if .FunctionCall }}
|
||||
</tool_call>
|
||||
{{- else if eq .RoleName "tool" }}
|
||||
</tool_response>
|
||||
{{- end }}<|im_end|>
|
||||
{{ end -}}<|im_end|>
|
||||
completion: |
|
||||
{{.Input}}
|
||||
function: |-
|
||||
function: |
|
||||
<|im_start|>system
|
||||
You are a function calling AI model.
|
||||
Here are the available tools:
|
||||
<tools>
|
||||
You are an AI assistant that executes function calls, and these are the tools at your disposal:
|
||||
{{range .Functions}}
|
||||
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
|
||||
{{end}}
|
||||
</tools>
|
||||
You should call the tools provided to you sequentially
|
||||
Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
|
||||
<scratchpad>
|
||||
{step-by-step reasoning and plan in bullet points}
|
||||
</scratchpad>
|
||||
For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
|
||||
<tool_call>
|
||||
{"arguments": <args-dict>, "name": <function-name>}
|
||||
</tool_call><|im_end|>
|
||||
<|im_end|>
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
<|im_start|>assistant
|
||||
|
||||
download_files:
|
||||
- filename: localai-functioncall-phi-4-v0.3-q4_k_m.gguf
|
||||
sha256: 23fee048ded2a6e2e1a7b6bbefa6cbf83068f194caa9552aecbaa00fec8a16d5
|
||||
uri: huggingface://mudler/LocalAI-functioncall-phi-4-v0.3-Q4_K_M-GGUF/localai-functioncall-phi-4-v0.3-q4_k_m.gguf
|
||||
@@ -1,35 +1,49 @@
|
||||
backend: llama-cpp
|
||||
context_size: 4096
|
||||
f16: true
|
||||
mmap: true
|
||||
mmproj: minicpm-v-2_6-mmproj-f16.gguf
|
||||
name: gpt-4o
|
||||
|
||||
roles:
|
||||
user: "USER:"
|
||||
assistant: "ASSISTANT:"
|
||||
system: "SYSTEM:"
|
||||
|
||||
mmproj: llava-v1.6-7b-mmproj-f16.gguf
|
||||
parameters:
|
||||
model: llava-v1.6-mistral-7b.Q5_K_M.gguf
|
||||
temperature: 0.2
|
||||
top_k: 40
|
||||
top_p: 0.95
|
||||
seed: -1
|
||||
|
||||
model: minicpm-v-2_6-Q4_K_M.gguf
|
||||
stopwords:
|
||||
- <|im_end|>
|
||||
- <dummy32000>
|
||||
- </s>
|
||||
- <|endoftext|>
|
||||
template:
|
||||
chat: |
|
||||
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
chat_message: |
|
||||
<|im_start|>{{ .RoleName }}
|
||||
{{ if .FunctionCall -}}
|
||||
Function call:
|
||||
{{ else if eq .RoleName "tool" -}}
|
||||
Function response:
|
||||
{{ end -}}
|
||||
{{ if .Content -}}
|
||||
{{.Content }}
|
||||
{{ end -}}
|
||||
{{ if .FunctionCall -}}
|
||||
{{toJson .FunctionCall}}
|
||||
{{ end -}}<|im_end|>
|
||||
completion: |
|
||||
{{.Input}}
|
||||
ASSISTANT:
|
||||
function: |
|
||||
<|im_start|>system
|
||||
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
|
||||
{{range .Functions}}
|
||||
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
|
||||
{{end}}
|
||||
For each function call return a json object with function name and arguments
|
||||
<|im_end|>
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
|
||||
download_files:
|
||||
- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
|
||||
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
|
||||
- filename: llava-v1.6-7b-mmproj-f16.gguf
|
||||
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
|
||||
|
||||
usage: |
|
||||
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
|
||||
"model": "gpt-4-vision-preview",
|
||||
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
|
||||
- filename: minicpm-v-2_6-Q4_K_M.gguf
|
||||
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
|
||||
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
|
||||
- filename: minicpm-v-2_6-mmproj-f16.gguf
|
||||
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
|
||||
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
|
||||
@@ -1,7 +1,7 @@
embeddings: true
name: text-embedding-ada-002
backend: sentencetransformers
parameters:
model: all-MiniLM-L6-v2
model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf

usage: |
You can test this model with curl like this:
@@ -1,103 +1,53 @@
|
||||
name: gpt-4
|
||||
mmap: false
|
||||
context_size: 8192
|
||||
|
||||
f16: false
|
||||
parameters:
|
||||
model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
|
||||
|
||||
stopwords:
|
||||
- "<|im_end|>"
|
||||
- "<dummy32000>"
|
||||
- "</tool_call>"
|
||||
- "<|eot_id|>"
|
||||
- "<|end_of_text|>"
|
||||
|
||||
context_size: 4096
|
||||
f16: true
|
||||
function:
|
||||
# disable injecting the "answer" tool
|
||||
disable_no_action: true
|
||||
|
||||
capture_llm_results:
|
||||
- (?s)<Thought>(.*?)</Thought>
|
||||
grammar:
|
||||
# This allows the grammar to also return messages
|
||||
mixed_mode: true
|
||||
# Suffix to add to the grammar
|
||||
#prefix: '<tool_call>\n'
|
||||
# Force parallel calls in the grammar
|
||||
# parallel_calls: true
|
||||
|
||||
return_name_in_function_response: true
|
||||
# Without grammar uncomment the lines below
|
||||
# Warning: this is relying only on the capability of the
|
||||
# LLM model to generate the correct function call.
|
||||
json_regex_match:
|
||||
- "(?s)<tool_call>(.*?)</tool_call>"
|
||||
- "(?s)<tool_call>(.*?)"
|
||||
properties_order: name,arguments
|
||||
json_regex_match:
|
||||
- (?s)<Output>(.*?)</Output>
|
||||
replace_llm_results:
|
||||
# Drop the scratchpad content from responses
|
||||
- key: "(?s)<scratchpad>.*</scratchpad>"
|
||||
- key: (?s)<Thought>(.*?)</Thought>
|
||||
value: ""
|
||||
replace_function_results:
|
||||
# Replace everything that is not JSON array or object
|
||||
#
|
||||
- key: '(?s)^[^{\[]*'
|
||||
value: ""
|
||||
- key: '(?s)[^}\]]*$'
|
||||
value: ""
|
||||
- key: "'([^']*?)'"
|
||||
value: "_DQUOTE_${1}_DQUOTE_"
|
||||
- key: '\\"'
|
||||
value: "__TEMP_QUOTE__"
|
||||
- key: "\'"
|
||||
value: "'"
|
||||
- key: "_DQUOTE_"
|
||||
value: '"'
|
||||
- key: "__TEMP_QUOTE__"
|
||||
value: '"'
|
||||
# Drop the scratchpad content from responses
|
||||
- key: "(?s)<scratchpad>.*</scratchpad>"
|
||||
value: ""
|
||||
|
||||
mmap: true
|
||||
name: gpt-4
|
||||
parameters:
|
||||
model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
|
||||
stopwords:
|
||||
- <|im_end|>
|
||||
- <dummy32000>
|
||||
- </s>
|
||||
template:
|
||||
chat: |
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
chat_message: |
|
||||
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
|
||||
{{- if .FunctionCall }}
|
||||
<tool_call>
|
||||
{{- else if eq .RoleName "tool" }}
|
||||
<tool_response>
|
||||
{{- end }}
|
||||
{{- if .Content}}
|
||||
<|im_start|>{{ .RoleName }}
|
||||
{{ if .FunctionCall -}}
|
||||
Function call:
|
||||
{{ else if eq .RoleName "tool" -}}
|
||||
Function response:
|
||||
{{ end -}}
|
||||
{{ if .Content -}}
|
||||
{{.Content }}
|
||||
{{- end }}
|
||||
{{- if .FunctionCall}}
|
||||
{{ end -}}
|
||||
{{ if .FunctionCall -}}
|
||||
{{toJson .FunctionCall}}
|
||||
{{- end }}
|
||||
{{- if .FunctionCall }}
|
||||
</tool_call>
|
||||
{{- else if eq .RoleName "tool" }}
|
||||
</tool_response>
|
||||
{{- end }}<|im_end|>
|
||||
{{ end -}}<|im_end|>
|
||||
completion: |
|
||||
{{.Input}}
|
||||
function: |-
|
||||
function: |
|
||||
<|im_start|>system
|
||||
You are a function calling AI model.
|
||||
Here are the available tools:
|
||||
<tools>
|
||||
You are an AI assistant that executes function calls, and these are the tools at your disposal:
|
||||
{{range .Functions}}
|
||||
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
|
||||
{{end}}
|
||||
</tools>
|
||||
You should call the tools provided to you sequentially
|
||||
Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
|
||||
<scratchpad>
|
||||
{step-by-step reasoning and plan in bullet points}
|
||||
</scratchpad>
|
||||
For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
|
||||
<tool_call>
|
||||
{"arguments": <args-dict>, "name": <function-name>}
|
||||
</tool_call><|im_end|>
|
||||
<|im_end|>
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
|
||||
download_files:
|
||||
- filename: localai-functioncall-phi-4-v0.3-q4_k_m.gguf
|
||||
sha256: 23fee048ded2a6e2e1a7b6bbefa6cbf83068f194caa9552aecbaa00fec8a16d5
|
||||
uri: huggingface://mudler/LocalAI-functioncall-phi-4-v0.3-Q4_K_M-GGUF/localai-functioncall-phi-4-v0.3-q4_k_m.gguf
|
||||
@@ -1,35 +1,50 @@
|
||||
backend: llama-cpp
|
||||
context_size: 4096
|
||||
mmap: false
|
||||
f16: false
|
||||
f16: true
|
||||
mmap: true
|
||||
mmproj: minicpm-v-2_6-mmproj-f16.gguf
|
||||
name: gpt-4o
|
||||
|
||||
roles:
|
||||
user: "USER:"
|
||||
assistant: "ASSISTANT:"
|
||||
system: "SYSTEM:"
|
||||
|
||||
mmproj: llava-v1.6-7b-mmproj-f16.gguf
|
||||
parameters:
|
||||
model: llava-v1.6-mistral-7b.Q5_K_M.gguf
|
||||
temperature: 0.2
|
||||
top_k: 40
|
||||
top_p: 0.95
|
||||
seed: -1
|
||||
|
||||
model: minicpm-v-2_6-Q4_K_M.gguf
|
||||
stopwords:
|
||||
- <|im_end|>
|
||||
- <dummy32000>
|
||||
- </s>
|
||||
- <|endoftext|>
|
||||
template:
|
||||
chat: |
|
||||
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
chat_message: |
|
||||
<|im_start|>{{ .RoleName }}
|
||||
{{ if .FunctionCall -}}
|
||||
Function call:
|
||||
{{ else if eq .RoleName "tool" -}}
|
||||
Function response:
|
||||
{{ end -}}
|
||||
{{ if .Content -}}
|
||||
{{.Content }}
|
||||
{{ end -}}
|
||||
{{ if .FunctionCall -}}
|
||||
{{toJson .FunctionCall}}
|
||||
{{ end -}}<|im_end|>
|
||||
completion: |
|
||||
{{.Input}}
|
||||
ASSISTANT:
|
||||
function: |
|
||||
<|im_start|>system
|
||||
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
|
||||
{{range .Functions}}
|
||||
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
|
||||
{{end}}
|
||||
For each function call return a json object with function name and arguments
|
||||
<|im_end|>
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
|
||||
|
||||
download_files:
|
||||
- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
|
||||
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
|
||||
- filename: llava-v1.6-7b-mmproj-f16.gguf
|
||||
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
|
||||
|
||||
usage: |
|
||||
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
|
||||
"model": "gpt-4-vision-preview",
|
||||
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
|
||||
- filename: minicpm-v-2_6-Q4_K_M.gguf
|
||||
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
|
||||
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
|
||||
- filename: minicpm-v-2_6-mmproj-f16.gguf
|
||||
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
|
||||
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
|
||||
backend/backend.proto

@@ -165,7 +165,6 @@ message Reply {

message GrammarTrigger {
string word = 1;
bool at_start = 2;
}

message ModelOptions {
@@ -191,11 +190,7 @@ message ModelOptions {
int32 NGQA = 20;
string ModelFile = 21;

// AutoGPTQ
string Device = 22;
bool UseTriton = 23;
string ModelBaseName = 24;
bool UseFastTokenizer = 25;

// Diffusers
string PipelineType = 26;
@@ -229,6 +224,11 @@ message ModelOptions {
int32 MaxModelLen = 54;
int32 TensorParallelSize = 55;
string LoadFormat = 58;
bool DisableLogStatus = 66;
string DType = 67;
int32 LimitImagePerPrompt = 68;
int32 LimitVideoPerPrompt = 69;
int32 LimitAudioPerPrompt = 70;

string MMProj = 41;
backend/cpp/llama/CMakeLists.txt

@@ -2,7 +2,7 @@
## XXX: In some versions of CMake clip wasn't being built before llama.
## This is an hack for now, but it should be fixed in the future.
set(TARGET myclip)
add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
add_library(${TARGET} clip.cpp clip.h clip-impl.h llava.cpp llava.h)
install(TARGETS ${TARGET} LIBRARY)
target_include_directories(myclip PUBLIC .)
target_include_directories(myclip PUBLIC ../..)
backend/cpp/llama/Makefile

@@ -8,7 +8,7 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
TARGET?=--target grpc-server

# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF

# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
ifeq ($(BUILD_TYPE),cublas)
@@ -36,11 +36,18 @@ else ifeq ($(OS),Darwin)
endif

ifeq ($(BUILD_TYPE),sycl_f16)
CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
CMAKE_ARGS+=-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DCMAKE_CXX_FLAGS="-fsycl" \
-DGGML_SYCL_F16=ON
endif

ifeq ($(BUILD_TYPE),sycl_f32)
CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
CMAKE_ARGS+=-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DCMAKE_CXX_FLAGS="-fsycl"
endif

llama.cpp:
@@ -77,4 +84,4 @@ ifneq (,$(findstring sycl,$(BUILD_TYPE)))
else
+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)
endif
cp llama.cpp/build/bin/grpc-server .
cp llama.cpp/build/bin/grpc-server .
@@ -217,6 +217,7 @@ struct llama_client_slot
|
||||
|
||||
bool infill = false;
|
||||
bool embedding = false;
|
||||
bool reranker = false;
|
||||
bool has_next_token = true;
|
||||
bool truncated = false;
|
||||
bool stopped_eos = false;
|
||||
@@ -467,9 +468,10 @@ struct llama_server_context
|
||||
bool all_slots_are_idle = false;
|
||||
bool add_bos_token = true;
|
||||
bool has_eos_token = true;
|
||||
bool has_gpu = false;
|
||||
|
||||
bool grammar_lazy = false;
|
||||
std::vector<common_grammar_trigger> grammar_trigger_words;
|
||||
std::vector<common_grammar_trigger> grammar_triggers;
|
||||
|
||||
int32_t n_ctx; // total context for all clients / slots
|
||||
|
||||
@@ -508,12 +510,15 @@ struct llama_server_context
|
||||
bool load_model(const common_params ¶ms_)
|
||||
{
|
||||
params = params_;
|
||||
if (!params.mmproj.empty()) {
|
||||
if (!params.mmproj.path.empty()) {
|
||||
multimodal = true;
|
||||
LOG_INFO("Multi Modal Mode Enabled", {});
|
||||
clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
|
||||
clp_ctx = clip_init(params.mmproj.path.c_str(), clip_context_params {
|
||||
/* use_gpu */ has_gpu,
|
||||
/*verbosity=*/ GGML_LOG_LEVEL_INFO,
|
||||
});
|
||||
if(clp_ctx == nullptr) {
|
||||
LOG_ERR("unable to load clip model: %s", params.mmproj.c_str());
|
||||
LOG_ERR("unable to load clip model: %s", params.mmproj.path.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -527,10 +532,16 @@ struct llama_server_context
|
||||
ctx = common_init.context.release();
|
||||
if (model == nullptr)
|
||||
{
|
||||
LOG_ERR("unable to load model: %s", params.model.c_str());
|
||||
LOG_ERR("unable to load model: %s", params.model.path.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
// Enable reranking if embeddings are enabled - moved after context initialization
|
||||
if (params.embedding) {
|
||||
params.reranking = true;
|
||||
LOG_INFO("Reranking enabled (embeddings are enabled)", {});
|
||||
}
|
||||
|
||||
if (multimodal) {
|
||||
const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
|
||||
const int n_embd_llm = llama_model_n_embd(model);
|
||||
@@ -709,7 +720,7 @@ struct llama_server_context
|
||||
slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
|
||||
slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
|
||||
slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
|
||||
slot->sparams.grammar_trigger_words = grammar_trigger_words;
|
||||
slot->sparams.grammar_triggers = grammar_triggers;
|
||||
slot->sparams.grammar_lazy = grammar_lazy;
|
||||
|
||||
if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
|
||||
@@ -1350,7 +1361,7 @@ struct llama_server_context
|
||||
queue_results.send(res);
|
||||
}
|
||||
|
||||
void send_embedding(llama_client_slot &slot)
|
||||
void send_embedding(llama_client_slot &slot, const llama_batch & batch)
|
||||
{
|
||||
task_result res;
|
||||
res.id = slot.task_id;
|
||||
@@ -1372,16 +1383,96 @@ struct llama_server_context
|
||||
else
|
||||
{
|
||||
const float *data = llama_get_embeddings(ctx);
|
||||
std::vector<float> embedding(data, data + n_embd);
|
||||
std::vector<float> embd_res(n_embd, 0.0f);
|
||||
std::vector<std::vector<float>> embedding;
|
||||
for (int i = 0; i < batch.n_tokens; ++i) {
|
||||
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
|
||||
if (embd == NULL) {
|
||||
embd = llama_get_embeddings_ith(ctx, i);
|
||||
}
|
||||
|
||||
if (embd == NULL) {
|
||||
LOG("failed to get embeddings");
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// normalize only when there is pooling
|
||||
// TODO: configurable
|
||||
if (llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_NONE) {
|
||||
common_embd_normalize(embd, embd_res.data(), n_embd, 2);
|
||||
embedding.push_back(embd_res);
|
||||
} else {
|
||||
embedding.push_back({ embd, embd + n_embd });
|
||||
}
|
||||
}
|
||||
|
||||
// OAI compat
|
||||
res.result_json = json
|
||||
{
|
||||
{"embedding", embedding },
|
||||
{"embedding", embedding[0] },
|
||||
};
|
||||
}
|
||||
queue_results.send(res);
|
||||
}
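The pooled branch above runs each sequence embedding through `common_embd_normalize(..., 2)`, which appears to be an L2 (Euclidean) normalization. An illustrative Python equivalent of that single step, not a transcription of the C++ helper:

```python
import math

def l2_normalize(embedding, eps=1e-12):
    # L2-normalize a vector, as the pooled-embedding branch above does before sending it back.
    norm = math.sqrt(sum(x * x for x in embedding))
    return [x / max(norm, eps) for x in embedding]

print(l2_normalize([3.0, 4.0]))  # -> [0.6, 0.8]
```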
|
||||
|
||||
void request_completion(int task_id, json data, bool infill, bool embedding, int multitask_id)
|
||||
void send_rerank(llama_client_slot &slot, const llama_batch & batch)
|
||||
{
|
||||
task_result res;
|
||||
res.id = slot.task_id;
|
||||
res.multitask_id = slot.multitask_id;
|
||||
res.error = false;
|
||||
res.stop = true;
|
||||
|
||||
float score = -1e6f; // Default score if we fail to get embeddings
|
||||
|
||||
if (!params.reranking)
|
||||
{
|
||||
LOG_WARNING("reranking disabled", {
|
||||
{"params.reranking", params.reranking},
|
||||
});
|
||||
}
|
||||
else if (ctx == nullptr)
|
||||
{
|
||||
LOG_ERR("context is null, cannot perform reranking");
|
||||
res.error = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i = 0; i < batch.n_tokens; ++i) {
|
||||
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
|
||||
if (embd == NULL) {
|
||||
embd = llama_get_embeddings_ith(ctx, i);
|
||||
}
|
||||
|
||||
if (embd == NULL) {
|
||||
LOG("failed to get embeddings");
|
||||
continue;
|
||||
}
|
||||
|
||||
score = embd[0];
|
||||
}
|
||||
}
|
||||
|
||||
// Format result as JSON similar to the embedding function
|
||||
res.result_json = json
|
||||
{
|
||||
{"score", score},
|
||||
{"tokens", slot.num_prompt_tokens}
|
||||
};
|
||||
|
||||
queue_results.send(res);
|
||||
}
|
||||
|
||||
void request_completion(int task_id, json data, bool infill, bool embedding, bool rerank, int multitask_id)
|
||||
{
|
||||
task_server task;
|
||||
task.id = task_id;
|
||||
@@ -1389,6 +1480,7 @@ struct llama_server_context
|
||||
task.data = std::move(data);
|
||||
task.infill_mode = infill;
|
||||
task.embedding_mode = embedding;
|
||||
task.reranking_mode = rerank;
|
||||
task.type = TASK_TYPE_COMPLETION;
|
||||
task.multitask_id = multitask_id;
|
||||
|
||||
@@ -1520,7 +1612,7 @@ struct llama_server_context
|
||||
subtask_data["prompt"] = subtask_data["prompt"][i];
|
||||
|
||||
// subtasks inherit everything else (infill mode, embedding mode, etc.)
|
||||
request_completion(subtask_ids[i], subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multitask_id);
|
||||
request_completion(subtask_ids[i], subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multiprompt_task.reranking_mode, multitask_id);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1559,6 +1651,7 @@ struct llama_server_context
|
||||
|
||||
slot->infill = task.infill_mode;
|
||||
slot->embedding = task.embedding_mode;
|
||||
slot->reranker = task.reranking_mode;
|
||||
slot->task_id = task.id;
|
||||
slot->multitask_id = task.multitask_id;
|
||||
|
||||
@@ -1996,7 +2089,15 @@ struct llama_server_context
|
||||
// prompt evaluated for embedding
|
||||
if (slot.embedding)
|
||||
{
|
||||
send_embedding(slot);
|
||||
send_embedding(slot, batch_view);
|
||||
slot.release();
|
||||
slot.i_batch = -1;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (slot.reranker)
|
||||
{
|
||||
send_rerank(slot, batch_view);
|
||||
slot.release();
|
||||
slot.i_batch = -1;
|
||||
continue;
|
||||
@@ -2090,7 +2191,11 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
|
||||
}
|
||||
|
||||
std::function<void(int)> shutdown_handler;
|
||||
inline void signal_handler(int signal) { shutdown_handler(signal); }
|
||||
|
||||
inline void signal_handler(int signal) {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
/////////////////////////////////
|
||||
////////////////////////////////
|
||||
@@ -2286,15 +2391,15 @@ static std::string get_all_kv_cache_types() {
|
||||
}
|
||||
|
||||
static void params_parse(const backend::ModelOptions* request,
|
||||
common_params & params) {
|
||||
common_params & params, llama_server_context &llama) {
|
||||
|
||||
// this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809
|
||||
|
||||
params.model = request->modelfile();
|
||||
params.model.path = request->modelfile();
|
||||
if (!request->mmproj().empty()) {
|
||||
// get the directory of modelfile
|
||||
std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
|
||||
params.mmproj = model_dir + "/"+ request->mmproj();
|
||||
std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\"));
|
||||
params.mmproj.path = model_dir + "/"+ request->mmproj();
|
||||
}
|
||||
// params.model_alias ??
|
||||
params.model_alias = request->modelfile();
|
||||
@@ -2324,6 +2429,20 @@ static void params_parse(const backend::ModelOptions* request,
|
||||
add_rpc_devices(std::string(llama_grpc_servers));
|
||||
}
|
||||
|
||||
// decode options. Options are in form optname:optvale, or if booleans only optname.
|
||||
for (int i = 0; i < request->options_size(); i++) {
|
||||
std::string opt = request->options(i);
|
||||
char *optname = strtok(&opt[0], ":");
|
||||
char *optval = strtok(NULL, ":");
|
||||
if (optval == NULL) {
|
||||
optval = "true";
|
||||
}
|
||||
|
||||
if (!strcmp(optname, "gpu")) {
|
||||
llama.has_gpu = true;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Add yarn
|
||||
|
||||
if (!request->tensorsplit().empty()) {
|
||||
@@ -2355,7 +2474,7 @@ static void params_parse(const backend::ModelOptions* request,
|
||||
scale_factor = request->lorascale();
|
||||
}
|
||||
// get the directory of modelfile
|
||||
std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
|
||||
std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\"));
|
||||
params.lora_adapters.push_back({ model_dir + "/"+request->loraadapter(), scale_factor });
|
||||
}
|
||||
params.use_mlock = request->mlock();
|
||||
@@ -2393,12 +2512,12 @@ static void params_parse(const backend::ModelOptions* request,
|
||||
llama.grammar_lazy = true;
|
||||
for (int i = 0; i < request->grammartriggers_size(); i++) {
|
||||
common_grammar_trigger trigger;
|
||||
trigger.word = request->grammartriggers(i).word();
|
||||
trigger.at_start = request->grammartriggers(i).at_start();
|
||||
llama.grammar_trigger_words.push_back(trigger);
|
||||
trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_WORD;
|
||||
trigger.value = request->grammartriggers(i).word();
|
||||
// trigger.at_start = request->grammartriggers(i).at_start();
|
||||
llama.grammar_triggers.push_back(trigger);
|
||||
LOG_INFO("grammar trigger", {
|
||||
{ "word", trigger.word },
|
||||
{ "at_start", trigger.at_start }
|
||||
{ "word", trigger.value },
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -2417,7 +2536,7 @@ public:
|
||||
grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
|
||||
// Implement LoadModel RPC
|
||||
common_params params;
|
||||
params_parse(request, params);
|
||||
params_parse(request, params, llama);
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
@@ -2439,7 +2558,7 @@ public:
|
||||
json data = parse_options(true, request, llama);
|
||||
const int task_id = llama.queue_tasks.get_new_id();
|
||||
llama.queue_results.add_waiting_task_id(task_id);
|
||||
llama.request_completion(task_id, data, false, false, -1);
|
||||
llama.request_completion(task_id, data, false, false, false, -1);
|
||||
while (true)
|
||||
{
|
||||
task_result result = llama.queue_results.recv(task_id);
|
||||
@@ -2493,7 +2612,7 @@ public:
|
||||
json data = parse_options(false, request, llama);
|
||||
const int task_id = llama.queue_tasks.get_new_id();
|
||||
llama.queue_results.add_waiting_task_id(task_id);
|
||||
llama.request_completion(task_id, data, false, false, -1);
|
||||
llama.request_completion(task_id, data, false, false, false, -1);
|
||||
std::string completion_text;
|
||||
task_result result = llama.queue_results.recv(task_id);
|
||||
if (!result.error && result.stop) {
|
||||
@@ -2530,7 +2649,7 @@ public:
|
||||
json data = parse_options(false, request, llama);
|
||||
const int task_id = llama.queue_tasks.get_new_id();
|
||||
llama.queue_results.add_waiting_task_id(task_id);
|
||||
llama.request_completion(task_id, { {"prompt", data["embeddings"]}, { "n_predict", 0}, {"image_data", ""} }, false, true, -1);
|
||||
llama.request_completion(task_id, { {"prompt", data["embeddings"]}, { "n_predict", 0}, {"image_data", ""} }, false, true, false, -1);
|
||||
// get the result
|
||||
task_result result = llama.queue_results.recv(task_id);
|
||||
//std::cout << "Embedding result JSON" << result.result_json.dump() << std::endl;
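This handler is what backs the OpenAI-compatible embeddings route. A hedged usage sketch follows; the endpoint path and the model name are illustrative assumptions, not taken from this diff.

```python
import requests

# Sketch of an embeddings request against LocalAI's OpenAI-compatible API.
# Model name is an assumption; use whatever embedding model is configured locally.
resp = requests.post(
    "http://localhost:8080/v1/embeddings",
    json={"model": "text-embedding-ada-002", "input": "A sentence to embed"},
    timeout=60,
)
print(len(resp.json()["data"][0]["embedding"]))  # embedding dimensionality
```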
|
||||
@@ -2562,6 +2681,46 @@ public:
|
||||
return grpc::Status::OK;
|
||||
}
|
||||
|
||||
grpc::Status Rerank(ServerContext* context, const backend::RerankRequest* request, backend::RerankResult* rerankResult) {
|
||||
// Create a JSON object with the query and documents
|
||||
json data = {
|
||||
{"prompt", request->query()},
|
||||
{"documents", request->documents()},
|
||||
{"top_n", request->top_n()}
|
||||
};
|
||||
|
||||
// Generate a new task ID
|
||||
const int task_id = llama.queue_tasks.get_new_id();
|
||||
llama.queue_results.add_waiting_task_id(task_id);
|
||||
|
||||
// Queue the task with reranking mode enabled
|
||||
llama.request_completion(task_id, data, false, false, true, -1);
|
||||
|
||||
// Get the result
|
||||
task_result result = llama.queue_results.recv(task_id);
|
||||
llama.queue_results.remove_waiting_task_id(task_id);
|
||||
|
||||
if (!result.error && result.stop) {
|
||||
// Set usage information
|
||||
backend::Usage* usage = rerankResult->mutable_usage();
|
||||
usage->set_total_tokens(result.result_json.value("tokens", 0));
|
||||
usage->set_prompt_tokens(result.result_json.value("tokens", 0));
|
||||
|
||||
// Get the score from the result
|
||||
float score = result.result_json.value("score", 0.0f);
|
||||
|
||||
// Create document results for each input document
|
||||
for (int i = 0; i < request->documents_size(); i++) {
|
||||
backend::DocumentResult* doc_result = rerankResult->add_results();
|
||||
doc_result->set_index(i);
|
||||
doc_result->set_text(request->documents(i));
|
||||
doc_result->set_relevance_score(score);
|
||||
}
|
||||
}
|
||||
|
||||
return grpc::Status::OK;
|
||||
}
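The `Rerank` RPC above is what a Jina-style rerank request ultimately hits. A hedged client-side sketch; the `/v1/rerank` path and the model name are assumptions for illustration, while the query/documents/top_n fields mirror the JSON payload built in the handler.

```python
import requests

payload = {
    "model": "jina-reranker-v1-base-en",  # assumption: any configured reranker model
    "query": "Organic skincare products for sensitive skin",
    "documents": [
        "Eco-friendly kitchenware for modern homes",
        "Natural organic skincare range for sensitive skin",
    ],
    "top_n": 2,
}
resp = requests.post("http://localhost:8080/v1/rerank", json=payload, timeout=60)
for r in resp.json()["results"]:
    print(r["index"], r["relevance_score"])
```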
|
||||
|
||||
grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
|
||||
llama_client_slot* active_slot = llama.get_active_slot();
|
||||
|
||||
@@ -2594,7 +2753,9 @@ void RunServer(const std::string& server_address) {
|
||||
ServerBuilder builder;
|
||||
builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
|
||||
builder.RegisterService(&service);
|
||||
|
||||
builder.SetMaxMessageSize(50 * 1024 * 1024); // 50MB
|
||||
builder.SetMaxSendMessageSize(50 * 1024 * 1024); // 50MB
|
||||
builder.SetMaxReceiveMessageSize(50 * 1024 * 1024); // 50MB
|
||||
std::unique_ptr<Server> server(builder.BuildAndStart());
|
||||
std::cout << "Server listening on " << server_address << std::endl;
|
||||
server->Wait();
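The 50MB limits are mostly about multimodal payloads: images travel base64-encoded inside the proto request (the Python backends below decode `request.Images` from base64), which inflates them by roughly a third, and gRPC's default receive limit (typically 4 MB) is far smaller. A quick sanity check of the inflation:

```python
import base64, os

# A ~30 MB binary payload grows to ~40 MB once base64-encoded, so a small default
# gRPC message cap would reject it while a 50 MB cap does not.
raw = os.urandom(30 * 1024 * 1024)
encoded = base64.b64encode(raw)
print(len(raw) / 1024 / 1024, len(encoded) / 1024 / 1024)  # ~30.0 -> ~40.0
```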
|
||||
@@ -2603,6 +2764,20 @@ void RunServer(const std::string& server_address) {
|
||||
int main(int argc, char** argv) {
|
||||
std::string server_address("localhost:50051");
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
struct sigaction sigint_action;
|
||||
sigint_action.sa_handler = signal_handler;
|
||||
sigemptyset (&sigint_action.sa_mask);
|
||||
sigint_action.sa_flags = 0;
|
||||
sigaction(SIGINT, &sigint_action, NULL);
|
||||
sigaction(SIGTERM, &sigint_action, NULL);
|
||||
#elif defined (_WIN32)
|
||||
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
||||
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
|
||||
};
|
||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
#endif
|
||||
|
||||
// Define long and short options
|
||||
struct option long_options[] = {
|
||||
{"addr", required_argument, nullptr, 'a'},
|
||||
|
||||
@@ -21,6 +21,7 @@ fi
|
||||
## XXX: In some versions of CMake clip wasn't being built before llama.
|
||||
## This is an hack for now, but it should be fixed in the future.
|
||||
cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
|
||||
cp -rfv llama.cpp/examples/llava/clip-impl.h llama.cpp/examples/grpc-server/clip-impl.h
|
||||
cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
|
||||
echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
|
||||
cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
|
||||
|
||||
backend/cpp/llama/utils.hpp (vendored, 1 line changed)
@@ -61,6 +61,7 @@ struct task_server {
|
||||
json data;
|
||||
bool infill_mode = false;
|
||||
bool embedding_mode = false;
|
||||
bool reranking_mode = false;
|
||||
int multitask_id = -1;
|
||||
};
|
||||
|
||||
|
||||
@@ -8,6 +8,13 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
|
||||
# keep standard at C11 and C++11
|
||||
CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC
|
||||
|
||||
GOCMD?=go
|
||||
CGO_LDFLAGS?=
|
||||
# Avoid parent make file overwriting CGO_LDFLAGS which is needed for hipblas
|
||||
CGO_LDFLAGS_SYCL=
|
||||
GO_TAGS?=
|
||||
LD_FLAGS?=
|
||||
|
||||
# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
|
||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
||||
|
||||
@@ -21,7 +28,7 @@ else ifeq ($(BUILD_TYPE),openblas)
|
||||
# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
|
||||
else ifeq ($(BUILD_TYPE),clblas)
|
||||
CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
|
||||
# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
|
||||
# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
|
||||
else ifeq ($(BUILD_TYPE),hipblas)
|
||||
CMAKE_ARGS+=-DGGML_HIP=ON
|
||||
# If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
|
||||
@@ -36,16 +43,35 @@ else ifeq ($(OS),Darwin)
|
||||
endif
|
||||
endif
|
||||
|
||||
# ifeq ($(BUILD_TYPE),sycl_f16)
|
||||
# CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DSD_SYCL=ON -DGGML_SYCL_F16=ON
|
||||
# endif
|
||||
ifeq ($(BUILD_TYPE),sycl_f16)
|
||||
CMAKE_ARGS+=-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
-DSD_SYCL=ON \
|
||||
-DGGML_SYCL_F16=ON
|
||||
CC=icx
|
||||
CXX=icpx
|
||||
CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
|
||||
CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
|
||||
CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
|
||||
CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
|
||||
endif
|
||||
|
||||
# ifeq ($(BUILD_TYPE),sycl_f32)
|
||||
# CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON
|
||||
# endif
|
||||
ifeq ($(BUILD_TYPE),sycl_f32)
|
||||
CMAKE_ARGS+=-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
-DSD_SYCL=ON
|
||||
CC=icx
|
||||
CXX=icpx
|
||||
CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
|
||||
CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
|
||||
CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
|
||||
CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
|
||||
endif
|
||||
|
||||
# warnings
|
||||
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
|
||||
# CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
|
||||
|
||||
# Find all .a archives in ARCHIVE_DIR
|
||||
# (ggml can have different backends cpu, cuda, etc., each backend generates a .a archive)
|
||||
@@ -86,11 +112,24 @@ endif
|
||||
$(MAKE) $(COMBINED_LIB)
|
||||
|
||||
gosd.o:
|
||||
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
|
||||
+bash -c "source $(ONEAPI_VARS); \
|
||||
$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c"
|
||||
else
|
||||
$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c
|
||||
endif
|
||||
|
||||
libsd.a: gosd.o
|
||||
cp $(INCLUDE_PATH)/build/libstable-diffusion.a ./libsd.a
|
||||
$(AR) rcs libsd.a gosd.o
|
||||
|
||||
stablediffusion-ggml:
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_SYCL)" C_INCLUDE_PATH="$(INCLUDE_PATH)" LIBRARY_PATH="$(LIBRARY_PATH)" \
|
||||
CC="$(CC)" CXX="$(CXX)" CGO_CXXFLAGS="$(CGO_CXXFLAGS)" \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o ../../../../backend-assets/grpc/stablediffusion-ggml ./
|
||||
ifneq ($(UPX),)
|
||||
$(UPX) ../../../../backend-assets/grpc/stablediffusion-ggml
|
||||
endif
|
||||
|
||||
clean:
|
||||
rm -rf gosd.o libsd.a build $(COMBINED_LIB)
|
||||
rm -rf gosd.o libsd.a build $(COMBINED_LIB)
|
||||
|
||||
@@ -35,6 +35,8 @@ const char* sample_method_str[] = {
|
||||
"ipndm",
|
||||
"ipndm_v",
|
||||
"lcm",
|
||||
"ddim_trailing",
|
||||
"tcd",
|
||||
};
|
||||
|
||||
// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
|
||||
@@ -173,6 +175,7 @@ int gen_image(char *text, char *negativeText, int width, int height, int steps,
|
||||
-1, //clip_skip
|
||||
cfg_scale, // sfg_scale
|
||||
3.5f,
|
||||
0, // eta
|
||||
width,
|
||||
height,
|
||||
sample_method,
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
.PHONY: autogptq
|
||||
autogptq: protogen
|
||||
bash install.sh
|
||||
|
||||
.PHONY: protogen
|
||||
protogen: backend_pb2_grpc.py backend_pb2.py
|
||||
|
||||
.PHONY: protogen-clean
|
||||
protogen-clean:
|
||||
$(RM) backend_pb2_grpc.py backend_pb2.py
|
||||
|
||||
backend_pb2_grpc.py backend_pb2.py:
|
||||
python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
|
||||
|
||||
.PHONY: clean
|
||||
clean: protogen-clean
|
||||
rm -rf venv __pycache__
|
||||
@@ -1,5 +0,0 @@
|
||||
# Creating a separate environment for the autogptq project
|
||||
|
||||
```
|
||||
make autogptq
|
||||
```
|
||||
@@ -1,153 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
from concurrent import futures
|
||||
import argparse
|
||||
import signal
|
||||
import sys
|
||||
import os
|
||||
import time
|
||||
import base64
|
||||
|
||||
import grpc
|
||||
import backend_pb2
|
||||
import backend_pb2_grpc
|
||||
|
||||
from auto_gptq import AutoGPTQForCausalLM
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
from transformers import TextGenerationPipeline
|
||||
|
||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||
|
||||
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
|
||||
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
|
||||
|
||||
# Implement the BackendServicer class with the service methods
|
||||
class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
def Health(self, request, context):
|
||||
return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
|
||||
def LoadModel(self, request, context):
|
||||
try:
|
||||
device = "cuda:0"
|
||||
if request.Device != "":
|
||||
device = request.Device
|
||||
|
||||
# support loading local model files
|
||||
model_path = os.path.join(os.environ.get('MODELS_PATH', './'), request.Model)
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=request.TrustRemoteCode)
|
||||
|
||||
# support model `Qwen/Qwen-VL-Chat-Int4`
|
||||
if "qwen-vl" in request.Model.lower():
|
||||
self.model_name = "Qwen-VL-Chat"
|
||||
model = AutoModelForCausalLM.from_pretrained(model_path,
|
||||
trust_remote_code=request.TrustRemoteCode,
|
||||
device_map="auto").eval()
|
||||
else:
|
||||
model = AutoGPTQForCausalLM.from_quantized(model_path,
|
||||
model_basename=request.ModelBaseName,
|
||||
use_safetensors=True,
|
||||
trust_remote_code=request.TrustRemoteCode,
|
||||
device=device,
|
||||
use_triton=request.UseTriton,
|
||||
quantize_config=None)
|
||||
|
||||
self.model = model
|
||||
self.tokenizer = tokenizer
|
||||
except Exception as err:
|
||||
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
|
||||
return backend_pb2.Result(message="Model loaded successfully", success=True)
|
||||
|
||||
def Predict(self, request, context):
|
||||
penalty = 1.0
|
||||
if request.Penalty != 0.0:
|
||||
penalty = request.Penalty
|
||||
tokens = 512
|
||||
if request.Tokens != 0:
|
||||
tokens = request.Tokens
|
||||
top_p = 0.95
|
||||
if request.TopP != 0.0:
|
||||
top_p = request.TopP
|
||||
|
||||
|
||||
prompt_images = self.recompile_vl_prompt(request)
|
||||
compiled_prompt = prompt_images[0]
|
||||
print(f"Prompt: {compiled_prompt}", file=sys.stderr)
|
||||
|
||||
# Implement Predict RPC
|
||||
pipeline = TextGenerationPipeline(
|
||||
model=self.model,
|
||||
tokenizer=self.tokenizer,
|
||||
max_new_tokens=tokens,
|
||||
temperature=request.Temperature,
|
||||
top_p=top_p,
|
||||
repetition_penalty=penalty,
|
||||
)
|
||||
t = pipeline(compiled_prompt)[0]["generated_text"]
|
||||
print(f"generated_text: {t}", file=sys.stderr)
|
||||
|
||||
if compiled_prompt in t:
|
||||
t = t.replace(compiled_prompt, "")
|
||||
# house keeping. Remove the image files from /tmp folder
|
||||
for img_path in prompt_images[1]:
|
||||
try:
|
||||
os.remove(img_path)
|
||||
except Exception as e:
|
||||
print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
|
||||
|
||||
return backend_pb2.Result(message=bytes(t, encoding='utf-8'))
|
||||
|
||||
def PredictStream(self, request, context):
|
||||
# Implement PredictStream RPC
|
||||
#for reply in some_data_generator():
|
||||
# yield reply
|
||||
# Not implemented yet
|
||||
return self.Predict(request, context)
|
||||
|
||||
def recompile_vl_prompt(self, request):
|
||||
prompt = request.Prompt
|
||||
image_paths = []
|
||||
|
||||
if "qwen-vl" in self.model_name.lower():
|
||||
# request.Images is an array which contains base64 encoded images. Iterate the request.Images array, decode and save each image to /tmp folder with a random filename.
|
||||
# Then, save the image file paths to an array "image_paths".
|
||||
# read "request.Prompt", replace "[img-%d]" with the image file paths in the order they appear in "image_paths". Save the new prompt to "prompt".
|
||||
for i, img in enumerate(request.Images):
|
||||
timestamp = str(int(time.time() * 1000)) # Generate timestamp
|
||||
img_path = f"/tmp/vl-{timestamp}.jpg" # Use timestamp in filename
|
||||
with open(img_path, "wb") as f:
|
||||
f.write(base64.b64decode(img))
|
||||
image_paths.append(img_path)
|
||||
prompt = prompt.replace(f"[img-{i}]", "<img>" + img_path + "</img>,")
|
||||
else:
|
||||
prompt = request.Prompt
|
||||
return (prompt, image_paths)
|
||||
|
||||
def serve(address):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
server.add_insecure_port(address)
|
||||
server.start()
|
||||
print("Server started. Listening on: " + address, file=sys.stderr)
|
||||
|
||||
# Define the signal handler function
|
||||
def signal_handler(sig, frame):
|
||||
print("Received termination signal. Shutting down...")
|
||||
server.stop(0)
|
||||
sys.exit(0)
|
||||
|
||||
# Set the signal handlers for SIGINT and SIGTERM
|
||||
signal.signal(signal.SIGINT, signal_handler)
|
||||
signal.signal(signal.SIGTERM, signal_handler)
|
||||
|
||||
try:
|
||||
while True:
|
||||
time.sleep(_ONE_DAY_IN_SECONDS)
|
||||
except KeyboardInterrupt:
|
||||
server.stop(0)
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Run the gRPC server.")
|
||||
parser.add_argument(
|
||||
"--addr", default="localhost:50051", help="The address to bind the server to."
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
serve(args.addr)
|
||||
@@ -1,14 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
source $(dirname $0)/../common/libbackend.sh
|
||||
|
||||
# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
|
||||
# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
|
||||
# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
|
||||
# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
|
||||
if [ "x${BUILD_PROFILE}" == "xintel" ]; then
|
||||
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
|
||||
fi
|
||||
|
||||
installRequirements
|
||||
@@ -1,2 +0,0 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu118
|
||||
torch==2.4.1+cu118
|
||||
@@ -1 +0,0 @@
|
||||
torch==2.4.1
|
||||
@@ -1,2 +0,0 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/rocm6.0
|
||||
torch==2.4.1+rocm6.0
|
||||
@@ -1,6 +0,0 @@
|
||||
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
|
||||
intel-extension-for-pytorch==2.3.110+xpu
|
||||
torch==2.3.1+cxx11.abi
|
||||
oneccl_bind_pt==2.3.100+xpu
|
||||
optimum[openvino]
|
||||
setuptools
|
||||
@@ -1,6 +0,0 @@
|
||||
accelerate
|
||||
auto-gptq==0.7.1
|
||||
grpcio==1.70.0
|
||||
protobuf
|
||||
certifi
|
||||
transformers
|
||||
@@ -1,4 +0,0 @@
|
||||
#!/bin/bash
|
||||
source $(dirname $0)/../common/libbackend.sh
|
||||
|
||||
startBackend $@
|
||||
@@ -1,6 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
source $(dirname $0)/../common/libbackend.sh
|
||||
|
||||
runUnittests
|
||||
@@ -61,7 +61,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
return backend_pb2.Result(success=True)
|
||||
|
||||
def serve(address):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
|
||||
options=[
|
||||
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
|
||||
])
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
server.add_insecure_port(address)
|
||||
server.start()
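Raising the limit on the server alone is usually not enough, since the caller's channel enforces its own receive cap. A minimal sketch of a matching client channel using the standard `grpc` Python options (the address is an assumption):

```python
import grpc

# Client channel with send/receive limits matching the 50MB server options above.
MAX_MSG = 50 * 1024 * 1024
channel = grpc.insecure_channel(
    "localhost:50051",
    options=[
        ("grpc.max_send_message_length", MAX_MSG),
        ("grpc.max_receive_message_length", MAX_MSG),
    ],
)
```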
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
bark==0.1.5
|
||||
grpcio==1.70.0
|
||||
grpcio==1.71.0
|
||||
protobuf
|
||||
certifi
|
||||
@@ -1,3 +1,3 @@
|
||||
grpcio==1.70.0
|
||||
grpcio==1.71.0
|
||||
protobuf
|
||||
grpcio-tools
|
||||
@@ -86,7 +86,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
return backend_pb2.Result(success=True)
|
||||
|
||||
def serve(address):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
|
||||
options=[
|
||||
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
|
||||
])
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
server.add_insecure_port(address)
|
||||
server.start()
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
transformers
|
||||
transformers==4.48.3
|
||||
accelerate
|
||||
torch==2.4.1
|
||||
coqui-tts
|
||||
@@ -1,6 +1,6 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu118
|
||||
torch==2.4.1+cu118
|
||||
torchaudio==2.4.1+cu118
|
||||
transformers
|
||||
transformers==4.48.3
|
||||
accelerate
|
||||
coqui-tts
|
||||
@@ -1,5 +1,5 @@
|
||||
torch==2.4.1
|
||||
torchaudio==2.4.1
|
||||
transformers
|
||||
transformers==4.48.3
|
||||
accelerate
|
||||
coqui-tts
|
||||
@@ -1,6 +1,6 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/rocm6.0
|
||||
torch==2.4.1+rocm6.0
|
||||
torchaudio==2.4.1+rocm6.0
|
||||
transformers
|
||||
transformers==4.48.3
|
||||
accelerate
|
||||
coqui-tts
|
||||
@@ -5,6 +5,6 @@ torchaudio==2.3.1+cxx11.abi
|
||||
oneccl_bind_pt==2.3.100+xpu
|
||||
optimum[openvino]
|
||||
setuptools
|
||||
transformers
|
||||
transformers==4.48.3
|
||||
accelerate
|
||||
coqui-tts
|
||||
@@ -1,4 +1,4 @@
|
||||
grpcio==1.70.0
|
||||
grpcio==1.71.0
|
||||
protobuf
|
||||
certifi
|
||||
packaging==24.1
|
||||
@@ -19,7 +19,7 @@ import grpc
|
||||
|
||||
from diffusers import SanaPipeline, StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, \
|
||||
EulerAncestralDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
|
||||
from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline
|
||||
from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline, Lumina2Text2ImgPipeline
|
||||
from diffusers.pipelines.stable_diffusion import safety_checker
|
||||
from diffusers.utils import load_image, export_to_video
|
||||
from compel import Compel, ReturnedEmbeddingsType
|
||||
@@ -287,6 +287,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
if request.LowVRAM:
|
||||
self.pipe.enable_model_cpu_offload()
|
||||
elif request.PipelineType == "Lumina2Text2ImgPipeline":
|
||||
self.pipe = Lumina2Text2ImgPipeline.from_pretrained(
|
||||
request.Model,
|
||||
torch_dtype=torch.bfloat16)
|
||||
if request.LowVRAM:
|
||||
self.pipe.enable_model_cpu_offload()
|
||||
elif request.PipelineType == "SanaPipeline":
|
||||
self.pipe = SanaPipeline.from_pretrained(
|
||||
request.Model,
|
||||
@@ -516,7 +522,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
|
||||
def serve(address):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
|
||||
options=[
|
||||
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
|
||||
])
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
server.add_insecure_port(address)
|
||||
server.start()
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
setuptools
|
||||
grpcio==1.70.0
|
||||
grpcio==1.71.0
|
||||
pillow
|
||||
protobuf
|
||||
certifi
|
||||
|
||||
@@ -105,7 +105,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
|
||||
def serve(address):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
|
||||
options=[
|
||||
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
|
||||
])
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
server.add_insecure_port(address)
|
||||
server.start()
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
grpcio==1.70.0
|
||||
grpcio==1.71.0
|
||||
protobuf
|
||||
certifi
|
||||
wheel
|
||||
|
||||
@@ -62,7 +62,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
return backend_pb2.TranscriptResult(segments=resultSegments, text=text)
|
||||
|
||||
def serve(address):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
|
||||
options=[
|
||||
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
|
||||
])
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
server.add_insecure_port(address)
|
||||
server.start()
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
grpcio==1.70.0
|
||||
grpcio==1.71.0
|
||||
protobuf
|
||||
grpcio-tools
|
||||
@@ -99,7 +99,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
return backend_pb2.Result(success=True)
|
||||
|
||||
def serve(address):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
|
||||
options=[
|
||||
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
|
||||
])
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
server.add_insecure_port(address)
|
||||
server.start()
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
grpcio==1.70.0
|
||||
grpcio==1.71.0
|
||||
protobuf
|
||||
phonemizer
|
||||
scipy
|
||||
|
||||
@@ -91,7 +91,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
return backend_pb2.RerankResult(usage=usage, results=results)
|
||||
|
||||
def serve(address):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
|
||||
options=[
|
||||
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
|
||||
])
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
server.add_insecure_port(address)
|
||||
server.start()
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
grpcio==1.70.0
|
||||
grpcio==1.71.0
|
||||
protobuf
|
||||
certifi
|
||||
@@ -559,7 +559,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
async def serve(address):
|
||||
# Start asyncio gRPC server
|
||||
server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
|
||||
options=[
|
||||
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
|
||||
])
|
||||
# Add the servicer to the server
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
# Bind the server to the address
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
grpcio==1.70.0
|
||||
grpcio==1.71.0
|
||||
protobuf
|
||||
certifi
|
||||
setuptools
|
||||
|
||||
@@ -109,6 +109,17 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
engine_args.swap_space = request.SwapSpace
|
||||
if request.MaxModelLen != 0:
|
||||
engine_args.max_model_len = request.MaxModelLen
|
||||
if request.DisableLogStatus:
|
||||
engine_args.disable_log_status = request.DisableLogStatus
|
||||
if request.DType != "":
|
||||
engine_args.dtype = request.DType
|
||||
if request.LimitImagePerPrompt != 0 or request.LimitVideoPerPrompt != 0 or request.LimitAudioPerPrompt != 0:
|
||||
# limit-mm-per-prompt defaults to 1 per modality, based on vLLM docs
|
||||
engine_args.limit_mm_per_prompt = {
|
||||
"image": max(request.LimitImagePerPrompt, 1),
|
||||
"video": max(request.LimitVideoPerPrompt, 1),
|
||||
"audio": max(request.LimitAudioPerPrompt, 1)
|
||||
}
|
||||
|
||||
try:
|
||||
self.llm = AsyncLLMEngine.from_engine_args(engine_args)
|
||||
@@ -269,7 +280,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
def load_image(self, image_path: str):
|
||||
"""
|
||||
Load an image from the given file path or base64 encoded data.
|
||||
|
||||
|
||||
Args:
|
||||
image_path (str): The path to the image file or base64 encoded data.
|
||||
|
||||
@@ -288,7 +299,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
def load_video(self, video_path: str):
|
||||
"""
|
||||
Load a video from the given file path.
|
||||
|
||||
|
||||
Args:
|
||||
video_path (str): The path to the image file.
|
||||
|
||||
@@ -309,7 +320,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
async def serve(address):
|
||||
# Start asyncio gRPC server
|
||||
server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
|
||||
options=[
|
||||
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
|
||||
])
|
||||
# Add the servicer to the server
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
# Bind the server to the address
|
||||
@@ -335,4 +351,4 @@ if __name__ == "__main__":
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
asyncio.run(serve(args.addr))
|
||||
asyncio.run(serve(args.addr))
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
grpcio==1.70.0
|
||||
grpcio==1.71.0
|
||||
protobuf
|
||||
certifi
|
||||
setuptools
|
||||
@@ -16,7 +16,7 @@ type Application struct {
|
||||
func newApplication(appConfig *config.ApplicationConfig) *Application {
|
||||
return &Application{
|
||||
backendLoader: config.NewBackendConfigLoader(appConfig.ModelPath),
|
||||
modelLoader: model.NewModelLoader(appConfig.ModelPath),
|
||||
modelLoader: model.NewModelLoader(appConfig.ModelPath, appConfig.SingleBackend),
|
||||
applicationConfig: appConfig,
|
||||
templatesEvaluator: templates.NewEvaluator(appConfig.ModelPath),
|
||||
}
|
||||
|
||||
@@ -143,7 +143,7 @@ func New(opts ...config.AppOption) (*Application, error) {
|
||||
}()
|
||||
}
|
||||
|
||||
if options.LoadToMemory != nil {
|
||||
if options.LoadToMemory != nil && !options.SingleBackend {
|
||||
for _, m := range options.LoadToMemory {
|
||||
cfg, err := application.BackendLoader().LoadBackendConfigFileByNameDefaultOptions(m, options)
|
||||
if err != nil {
|
||||
|
||||
@@ -17,6 +17,7 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendCo
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer loader.Close()
|
||||
|
||||
var fn func() ([]float32, error)
|
||||
switch model := inferenceModel.(type) {
|
||||
|
||||
@@ -16,6 +16,7 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer loader.Close()
|
||||
|
||||
fn := func() error {
|
||||
_, err := inferenceModel.GenerateImage(
|
||||
|
||||
@@ -53,6 +53,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer loader.Close()
|
||||
|
||||
var protoMessages []*proto.Message
|
||||
// if we are using the tokenizer template, we need to convert the messages to proto messages
|
||||
@@ -116,6 +117,11 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
|
||||
}
|
||||
|
||||
if tokenCallback != nil {
|
||||
|
||||
if c.TemplateConfig.ReplyPrefix != "" {
|
||||
tokenCallback(c.TemplateConfig.ReplyPrefix, tokenUsage)
|
||||
}
|
||||
|
||||
ss := ""
|
||||
|
||||
var partialRune []byte
|
||||
@@ -165,8 +171,13 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
|
||||
tokenUsage.TimingTokenGeneration = reply.TimingTokenGeneration
|
||||
tokenUsage.TimingPromptProcessing = reply.TimingPromptProcessing
|
||||
|
||||
response := string(reply.Message)
|
||||
if c.TemplateConfig.ReplyPrefix != "" {
|
||||
response = c.TemplateConfig.ReplyPrefix + response
|
||||
}
|
||||
|
||||
return LLMResponse{
|
||||
Response: string(reply.Message),
|
||||
Response: response,
|
||||
Usage: tokenUsage,
|
||||
}, err
|
||||
}
|
||||
|
||||
@@ -40,10 +40,6 @@ func ModelOptions(c config.BackendConfig, so *config.ApplicationConfig, opts ...
|
||||
grpcOpts := grpcModelOpts(c)
|
||||
defOpts = append(defOpts, model.WithLoadGRPCLoadModelOpts(grpcOpts))
|
||||
|
||||
if so.SingleBackend {
|
||||
defOpts = append(defOpts, model.WithSingleActiveBackend())
|
||||
}
|
||||
|
||||
if so.ParallelBackendRequests {
|
||||
defOpts = append(defOpts, model.EnableParallelRequests)
|
||||
}
|
||||
@@ -121,8 +117,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
|
||||
triggers := make([]*pb.GrammarTrigger, 0)
|
||||
for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers {
|
||||
triggers = append(triggers, &pb.GrammarTrigger{
|
||||
Word: t.Word,
|
||||
AtStart: t.AtStart,
|
||||
Word: t.Word,
|
||||
})
|
||||
|
||||
}
|
||||
@@ -159,35 +154,36 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
|
||||
SwapSpace: int32(c.SwapSpace),
|
||||
MaxModelLen: int32(c.MaxModelLen),
|
||||
TensorParallelSize: int32(c.TensorParallelSize),
|
||||
MMProj: c.MMProj,
|
||||
FlashAttention: c.FlashAttention,
|
||||
CacheTypeKey: c.CacheTypeK,
|
||||
CacheTypeValue: c.CacheTypeV,
|
||||
NoKVOffload: c.NoKVOffloading,
|
||||
YarnExtFactor: c.YarnExtFactor,
|
||||
YarnAttnFactor: c.YarnAttnFactor,
|
||||
YarnBetaFast: c.YarnBetaFast,
|
||||
YarnBetaSlow: c.YarnBetaSlow,
|
||||
NGQA: c.NGQA,
|
||||
RMSNormEps: c.RMSNormEps,
|
||||
MLock: mmlock,
|
||||
RopeFreqBase: c.RopeFreqBase,
|
||||
RopeScaling: c.RopeScaling,
|
||||
Type: c.ModelType,
|
||||
RopeFreqScale: c.RopeFreqScale,
|
||||
NUMA: c.NUMA,
|
||||
Embeddings: embeddings,
|
||||
LowVRAM: lowVRAM,
|
||||
NGPULayers: int32(nGPULayers),
|
||||
MMap: mmap,
|
||||
MainGPU: c.MainGPU,
|
||||
Threads: int32(*c.Threads),
|
||||
TensorSplit: c.TensorSplit,
|
||||
// AutoGPTQ
|
||||
ModelBaseName: c.AutoGPTQ.ModelBaseName,
|
||||
Device: c.AutoGPTQ.Device,
|
||||
UseTriton: c.AutoGPTQ.Triton,
|
||||
UseFastTokenizer: c.AutoGPTQ.UseFastTokenizer,
|
||||
DisableLogStatus: c.DisableLogStatus,
|
||||
DType: c.DType,
|
||||
// LimitMMPerPrompt vLLM
|
||||
LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
|
||||
LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
|
||||
LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
|
||||
MMProj: c.MMProj,
|
||||
FlashAttention: c.FlashAttention,
|
||||
CacheTypeKey: c.CacheTypeK,
|
||||
CacheTypeValue: c.CacheTypeV,
|
||||
NoKVOffload: c.NoKVOffloading,
|
||||
YarnExtFactor: c.YarnExtFactor,
|
||||
YarnAttnFactor: c.YarnAttnFactor,
|
||||
YarnBetaFast: c.YarnBetaFast,
|
||||
YarnBetaSlow: c.YarnBetaSlow,
|
||||
NGQA: c.NGQA,
|
||||
RMSNormEps: c.RMSNormEps,
|
||||
MLock: mmlock,
|
||||
RopeFreqBase: c.RopeFreqBase,
|
||||
RopeScaling: c.RopeScaling,
|
||||
Type: c.ModelType,
|
||||
RopeFreqScale: c.RopeFreqScale,
|
||||
NUMA: c.NUMA,
|
||||
Embeddings: embeddings,
|
||||
LowVRAM: lowVRAM,
|
||||
NGPULayers: int32(nGPULayers),
|
||||
MMap: mmap,
|
||||
MainGPU: c.MainGPU,
|
||||
Threads: int32(*c.Threads),
|
||||
TensorSplit: c.TensorSplit,
|
||||
// RWKV
|
||||
Tokenizer: c.Tokenizer,
|
||||
}
|
||||
|
||||
@@ -12,10 +12,10 @@ import (
|
||||
func Rerank(request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
|
||||
opts := ModelOptions(backendConfig, appConfig)
|
||||
rerankModel, err := loader.Load(opts...)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer loader.Close()
|
||||
|
||||
if rerankModel == nil {
|
||||
return nil, fmt.Errorf("could not load rerank model")
|
||||
|
||||
@@ -26,10 +26,10 @@ func SoundGeneration(
|
||||
|
||||
opts := ModelOptions(backendConfig, appConfig)
|
||||
soundGenModel, err := loader.Load(opts...)
|
||||
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
defer loader.Close()
|
||||
|
||||
if soundGenModel == nil {
|
||||
return "", nil, fmt.Errorf("could not load sound generation model")
|
||||
|
||||
@@ -20,6 +20,7 @@ func TokenMetrics(
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer loader.Close()
|
||||
|
||||
if model == nil {
|
||||
return nil, fmt.Errorf("could not loadmodel model")
|
||||
|
||||
@@ -14,10 +14,10 @@ func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.Bac
|
||||
|
||||
opts := ModelOptions(backendConfig, appConfig)
|
||||
inferenceModel, err = loader.Load(opts...)
|
||||
|
||||
if err != nil {
|
||||
return schema.TokenizeResponse{}, err
|
||||
}
|
||||
defer loader.Close()
|
||||
|
||||
predictOptions := gRPCPredictOpts(backendConfig, loader.ModelPath)
|
||||
predictOptions.Prompt = s
|
||||
|
||||
@@ -24,6 +24,7 @@ func ModelTranscription(audio, language string, translate bool, ml *model.ModelL
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer ml.Close()
|
||||
|
||||
if transcriptionModel == nil {
|
||||
return nil, fmt.Errorf("could not load transcription model")
|
||||
|
||||
@@ -23,10 +23,10 @@ func ModelTTS(
|
||||
) (string, *proto.Result, error) {
|
||||
opts := ModelOptions(backendConfig, appConfig, model.WithDefaultBackendString(model.PiperBackend))
|
||||
ttsModel, err := loader.Load(opts...)
|
||||
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
defer loader.Close()
|
||||
|
||||
if ttsModel == nil {
|
||||
return "", nil, fmt.Errorf("could not load tts model %q", backendConfig.Model)
|
||||
|
||||
@@ -19,6 +19,8 @@ func VAD(request *schema.VADRequest,
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer ml.Close()
|
||||
|
||||
req := proto.VADRequest{
|
||||
Audio: request.Audio,
|
||||
}
|
||||
|
||||
@@ -38,7 +38,7 @@ type RunCMD struct {
|
||||
|
||||
F16 bool `name:"f16" env:"LOCALAI_F16,F16" help:"Enable GPU acceleration" group:"performance"`
|
||||
Threads int `env:"LOCALAI_THREADS,THREADS" short:"t" help:"Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested" group:"performance"`
|
||||
ContextSize int `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" default:"512" help:"Default context size for models" group:"performance"`
|
||||
ContextSize int `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" help:"Default context size for models" group:"performance"`
|
||||
|
||||
Address string `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
|
||||
CORS bool `env:"LOCALAI_CORS,CORS" help:"" group:"api"`
|
||||
|
||||
@@ -74,7 +74,7 @@ func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {
|
||||
AssetsDestination: t.BackendAssetsPath,
|
||||
ExternalGRPCBackends: externalBackends,
|
||||
}
|
||||
ml := model.NewModelLoader(opts.ModelPath)
|
||||
ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)
|
||||
|
||||
defer func() {
|
||||
err := ml.StopAllGRPC()
|
||||
|
||||
@@ -32,7 +32,7 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
|
||||
}
|
||||
|
||||
cl := config.NewBackendConfigLoader(t.ModelsPath)
|
||||
ml := model.NewModelLoader(opts.ModelPath)
|
||||
ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)
|
||||
if err := cl.LoadBackendConfigsFromPath(t.ModelsPath); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -41,7 +41,7 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
|
||||
AudioDir: outputDir,
|
||||
AssetsDestination: t.BackendAssetsPath,
|
||||
}
|
||||
ml := model.NewModelLoader(opts.ModelPath)
|
||||
ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)
|
||||
|
||||
defer func() {
|
||||
err := ml.StopAllGRPC()
|
||||
|
||||
@@ -50,9 +50,6 @@ type BackendConfig struct {
|
||||
// LLM configs (GPT4ALL, Llama.cpp, ...)
|
||||
LLMConfig `yaml:",inline"`
|
||||
|
||||
// AutoGPTQ specifics
|
||||
AutoGPTQ AutoGPTQ `yaml:"autogptq"`
|
||||
|
||||
// Diffusers
|
||||
Diffusers Diffusers `yaml:"diffusers"`
|
||||
Step int `yaml:"step"`
|
||||
@@ -130,25 +127,28 @@ type LLMConfig struct {
|
||||
TrimSpace []string `yaml:"trimspace"`
|
||||
TrimSuffix []string `yaml:"trimsuffix"`
|
||||
|
||||
ContextSize *int `yaml:"context_size"`
|
||||
NUMA bool `yaml:"numa"`
|
||||
LoraAdapter string `yaml:"lora_adapter"`
|
||||
LoraBase string `yaml:"lora_base"`
|
||||
LoraAdapters []string `yaml:"lora_adapters"`
|
||||
LoraScales []float32 `yaml:"lora_scales"`
|
||||
LoraScale float32 `yaml:"lora_scale"`
|
||||
NoMulMatQ bool `yaml:"no_mulmatq"`
|
||||
DraftModel string `yaml:"draft_model"`
|
||||
NDraft int32 `yaml:"n_draft"`
|
||||
Quantization string `yaml:"quantization"`
|
||||
LoadFormat string `yaml:"load_format"`
|
||||
GPUMemoryUtilization float32 `yaml:"gpu_memory_utilization"` // vLLM
|
||||
TrustRemoteCode bool `yaml:"trust_remote_code"` // vLLM
|
||||
EnforceEager bool `yaml:"enforce_eager"` // vLLM
|
||||
SwapSpace int `yaml:"swap_space"` // vLLM
|
||||
MaxModelLen int `yaml:"max_model_len"` // vLLM
|
||||
TensorParallelSize int `yaml:"tensor_parallel_size"` // vLLM
|
||||
MMProj string `yaml:"mmproj"`
|
||||
ContextSize *int `yaml:"context_size"`
|
||||
NUMA bool `yaml:"numa"`
|
||||
LoraAdapter string `yaml:"lora_adapter"`
|
||||
LoraBase string `yaml:"lora_base"`
|
||||
LoraAdapters []string `yaml:"lora_adapters"`
|
||||
LoraScales []float32 `yaml:"lora_scales"`
|
||||
LoraScale float32 `yaml:"lora_scale"`
|
||||
NoMulMatQ bool `yaml:"no_mulmatq"`
|
||||
DraftModel string `yaml:"draft_model"`
|
||||
NDraft int32 `yaml:"n_draft"`
|
||||
Quantization string `yaml:"quantization"`
|
||||
LoadFormat string `yaml:"load_format"`
|
||||
GPUMemoryUtilization float32 `yaml:"gpu_memory_utilization"` // vLLM
|
||||
TrustRemoteCode bool `yaml:"trust_remote_code"` // vLLM
|
||||
EnforceEager bool `yaml:"enforce_eager"` // vLLM
|
||||
SwapSpace int `yaml:"swap_space"` // vLLM
|
||||
MaxModelLen int `yaml:"max_model_len"` // vLLM
|
||||
TensorParallelSize int `yaml:"tensor_parallel_size"` // vLLM
|
||||
DisableLogStatus bool `yaml:"disable_log_stats"` // vLLM
|
||||
DType string `yaml:"dtype"` // vLLM
|
||||
LimitMMPerPrompt LimitMMPerPrompt `yaml:"limit_mm_per_prompt"` // vLLM
|
||||
MMProj string `yaml:"mmproj"`
|
||||
|
||||
FlashAttention bool `yaml:"flash_attention"`
|
||||
NoKVOffloading bool `yaml:"no_kv_offloading"`
|
||||
@@ -166,12 +166,11 @@ type LLMConfig struct {
|
||||
CFGScale float32 `yaml:"cfg_scale"` // Classifier-Free Guidance Scale
|
||||
}
|
||||
|
||||
// AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
|
||||
type AutoGPTQ struct {
|
||||
ModelBaseName string `yaml:"model_base_name"`
|
||||
Device string `yaml:"device"`
|
||||
Triton bool `yaml:"triton"`
|
||||
UseFastTokenizer bool `yaml:"use_fast_tokenizer"`
|
||||
// LimitMMPerPrompt is a struct that holds the configuration for the limit-mm-per-prompt config in vLLM
|
||||
type LimitMMPerPrompt struct {
|
||||
LimitImagePerPrompt int `yaml:"image"`
|
||||
LimitVideoPerPrompt int `yaml:"video"`
|
||||
LimitAudioPerPrompt int `yaml:"audio"`
|
||||
}
|
||||
|
||||
// TemplateConfig is a struct that holds the configuration of the templating system
|
||||
@@ -203,6 +202,8 @@ type TemplateConfig struct {
|
||||
Multimodal string `yaml:"multimodal"`
|
||||
|
||||
JinjaTemplate bool `yaml:"jinja_template"`
|
||||
|
||||
ReplyPrefix string `yaml:"reply_prefix"`
|
||||
}
|
||||
|
||||
func (c *BackendConfig) UnmarshalYAML(value *yaml.Node) error {
|
||||
@@ -212,7 +213,15 @@ func (c *BackendConfig) UnmarshalYAML(value *yaml.Node) error {
|
||||
return err
|
||||
}
|
||||
*c = BackendConfig(aux)
|
||||
|
||||
c.KnownUsecases = GetUsecasesFromYAML(c.KnownUsecaseStrings)
|
||||
// Make sure the usecases are valid, we rewrite with what we identified
|
||||
c.KnownUsecaseStrings = []string{}
|
||||
for k, usecase := range GetAllBackendConfigUsecases() {
|
||||
if c.HasUsecases(usecase) {
|
||||
c.KnownUsecaseStrings = append(c.KnownUsecaseStrings, k)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -369,16 +378,6 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
|
||||
cfg.Embeddings = &falseV
|
||||
}
|
||||
|
||||
// Value passed by the top level are treated as default (no implicit defaults)
|
||||
// defaults are set by the user
|
||||
if ctx == 0 {
|
||||
ctx = 1024
|
||||
}
|
||||
|
||||
if cfg.ContextSize == nil {
|
||||
cfg.ContextSize = &ctx
|
||||
}
|
||||
|
||||
if threads == 0 {
|
||||
// Threads can't be 0
|
||||
threads = 4
|
||||
@@ -400,7 +399,7 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
|
||||
cfg.Debug = &trueV
|
||||
}
|
||||
|
||||
guessDefaultsFromFile(cfg, lo.modelPath)
|
||||
guessDefaultsFromFile(cfg, lo.modelPath, ctx)
|
||||
}
|
||||
|
||||
func (c *BackendConfig) Validate() bool {
|
||||
@@ -472,6 +471,10 @@ func GetAllBackendConfigUsecases() map[string]BackendConfigUsecases {
|
||||
}
|
||||
}
|
||||
|
||||
func stringToFlag(s string) string {
|
||||
return "FLAG_" + strings.ToUpper(s)
|
||||
}
|
||||
|
||||
func GetUsecasesFromYAML(input []string) *BackendConfigUsecases {
|
||||
if len(input) == 0 {
|
||||
return nil
|
||||
@@ -479,7 +482,7 @@ func GetUsecasesFromYAML(input []string) *BackendConfigUsecases {
|
||||
result := FLAG_ANY
|
||||
flags := GetAllBackendConfigUsecases()
|
||||
for _, str := range input {
|
||||
flag, exists := flags["FLAG_"+strings.ToUpper(str)]
|
||||
flag, exists := flags[stringToFlag(str)]
|
||||
if exists {
|
||||
result |= flag
|
||||
}
|
||||
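A reduced sketch of the bit-flag pattern behind BackendConfigUsecases and HasUsecases used above; the flag names and values below are hypothetical stand-ins, not the real FLAG_* constants:

package main

import "fmt"

type backendUsecases int

// Hypothetical stand-ins for the FLAG_* constants.
const (
    flagChat backendUsecases = 1 << iota // 1
    flagTTS                              // 2
    flagImage                            // 4
)

// hasUsecases mirrors the HasUsecases bit test.
func hasUsecases(set, u backendUsecases) bool { return set&u == u }

func main() {
    set := flagChat | flagTTS
    fmt.Println(hasUsecases(set, flagChat))  // true
    fmt.Println(hasUsecases(set, flagImage)) // false
}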
@@ -541,7 +544,7 @@ func (c *BackendConfig) GuessUsecases(u BackendConfigUsecases) bool {
|
||||
}
|
||||
}
|
||||
if (u & FLAG_TTS) == FLAG_TTS {
|
||||
ttsBackends := []string{"piper", "transformers-musicgen", "parler-tts"}
|
||||
ttsBackends := []string{"bark-cpp", "parler-tts", "piper", "transformers-musicgen"}
|
||||
if !slices.Contains(ttsBackends, c.Backend) {
|
||||
return false
|
||||
}
|
||||
|
||||
253
core/config/gguf.go
Normal file
@@ -0,0 +1,253 @@
|
||||
package config

import (
    "strings"

    "github.com/rs/zerolog/log"

    gguf "github.com/thxcode/gguf-parser-go"
)

type familyType uint8

const (
    Unknown familyType = iota
    LLaMa3
    CommandR
    Phi3
    ChatML
    Mistral03
    Gemma
    DeepSeek2
)

const (
    defaultContextSize = 1024
)

type settingsConfig struct {
    StopWords      []string
    TemplateConfig TemplateConfig
    RepeatPenalty  float64
}
|
||||
|
||||
// default settings to adopt with a given model family
|
||||
var defaultsSettings map[familyType]settingsConfig = map[familyType]settingsConfig{
|
||||
Gemma: {
|
||||
RepeatPenalty: 1.0,
|
||||
StopWords: []string{"<|im_end|>", "<end_of_turn>", "<start_of_turn>"},
|
||||
TemplateConfig: TemplateConfig{
|
||||
Chat: "{{.Input }}\n<start_of_turn>model\n",
|
||||
ChatMessage: "<start_of_turn>{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}<end_of_turn>",
|
||||
Completion: "{{.Input}}",
|
||||
},
|
||||
},
|
||||
DeepSeek2: {
|
||||
StopWords: []string{"<|end▁of▁sentence|>"},
|
||||
TemplateConfig: TemplateConfig{
|
||||
ChatMessage: `{{if eq .RoleName "user" -}}User: {{.Content }}
|
||||
{{ end -}}
|
||||
{{if eq .RoleName "assistant" -}}Assistant: {{.Content}}<|end▁of▁sentence|>{{end}}
|
||||
{{if eq .RoleName "system" -}}{{.Content}}
|
||||
{{end -}}`,
|
||||
Chat: "{{.Input -}}\nAssistant: ",
|
||||
},
|
||||
},
|
||||
LLaMa3: {
|
||||
StopWords: []string{"<|eot_id|>"},
|
||||
TemplateConfig: TemplateConfig{
|
||||
Chat: "<|begin_of_text|>{{.Input }}\n<|start_header_id|>assistant<|end_header_id|>",
|
||||
ChatMessage: "<|start_header_id|>{{ .RoleName }}<|end_header_id|>\n\n{{.Content }}<|eot_id|>",
|
||||
},
|
||||
},
|
||||
CommandR: {
|
||||
TemplateConfig: TemplateConfig{
|
||||
Chat: "{{.Input -}}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
|
||||
Functions: `<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
|
||||
You are a function calling AI model, you can call the following functions:
|
||||
## Available Tools
|
||||
{{range .Functions}}
|
||||
- {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
|
||||
{{end}}
|
||||
When using a tool, reply with JSON, for instance {"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}}
|
||||
<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Input -}}`,
|
||||
ChatMessage: `{{if eq .RoleName "user" -}}
|
||||
<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
|
||||
{{- else if eq .RoleName "system" -}}
|
||||
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
|
||||
{{- else if eq .RoleName "assistant" -}}
|
||||
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
|
||||
{{- else if eq .RoleName "tool" -}}
|
||||
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
|
||||
{{- else if .FunctionCall -}}
|
||||
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{toJson .FunctionCall}}}<|END_OF_TURN_TOKEN|>
|
||||
{{- end -}}`,
|
||||
},
|
||||
StopWords: []string{"<|END_OF_TURN_TOKEN|>"},
|
||||
},
|
||||
Phi3: {
|
||||
TemplateConfig: TemplateConfig{
|
||||
Chat: "{{.Input}}\n<|assistant|>",
|
||||
ChatMessage: "<|{{ .RoleName }}|>\n{{.Content}}<|end|>",
|
||||
Completion: "{{.Input}}",
|
||||
},
|
||||
StopWords: []string{"<|end|>", "<|endoftext|>"},
|
||||
},
|
||||
ChatML: {
|
||||
TemplateConfig: TemplateConfig{
|
||||
Chat: "{{.Input -}}\n<|im_start|>assistant",
|
||||
Functions: `<|im_start|>system
|
||||
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
|
||||
{{range .Functions}}
|
||||
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
|
||||
{{end}}
|
||||
For each function call return a json object with function name and arguments
|
||||
<|im_end|>
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant`,
|
||||
ChatMessage: `<|im_start|>{{ .RoleName }}
|
||||
{{ if .FunctionCall -}}
|
||||
Function call:
|
||||
{{ else if eq .RoleName "tool" -}}
|
||||
Function response:
|
||||
{{ end -}}
|
||||
{{ if .Content -}}
|
||||
{{.Content }}
|
||||
{{ end -}}
|
||||
{{ if .FunctionCall -}}
|
||||
{{toJson .FunctionCall}}
|
||||
{{ end -}}<|im_end|>`,
|
||||
},
|
||||
StopWords: []string{"<|im_end|>", "<dummy32000>", "</s>"},
|
||||
},
|
||||
Mistral03: {
|
||||
TemplateConfig: TemplateConfig{
|
||||
Chat: "{{.Input -}}",
|
||||
Functions: `[AVAILABLE_TOOLS] [{{range .Functions}}{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}{{end}} ] [/AVAILABLE_TOOLS]{{.Input }}`,
|
||||
ChatMessage: `{{if eq .RoleName "user" -}}
|
||||
[INST] {{.Content }} [/INST]
|
||||
{{- else if .FunctionCall -}}
|
||||
[TOOL_CALLS] {{toJson .FunctionCall}} [/TOOL_CALLS]
|
||||
{{- else if eq .RoleName "tool" -}}
|
||||
[TOOL_RESULTS] {{.Content}} [/TOOL_RESULTS]
|
||||
{{- else -}}
|
||||
{{ .Content -}}
|
||||
{{ end -}}`,
|
||||
},
|
||||
StopWords: []string{"<|im_end|>", "<dummy32000>", "</tool_call>", "<|eot_id|>", "<|end_of_text|>", "</s>", "[/TOOL_CALLS]", "[/ACTIONS]"},
|
||||
},
|
||||
}
|
||||
|
||||
// this maps well-known templates used in HF to the model families defined above
|
||||
var knownTemplates = map[string]familyType{
|
||||
`{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}`: ChatML,
|
||||
`{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}`: Mistral03,
|
||||
}
|
||||
|
||||
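For context, a tiny sketch of rendering one of the ChatMessage defaults above with Go's text/template; LocalAI's real templating layer adds helpers such as toJson, which this single user turn does not need:

package main

import (
    "os"
    "text/template"
)

// The LLaMa3 ChatMessage default from the table above.
const chatMessage = "<|start_header_id|>{{ .RoleName }}<|end_header_id|>\n\n{{.Content }}<|eot_id|>"

func main() {
    tpl := template.Must(template.New("msg").Parse(chatMessage))

    // Render a single user turn.
    _ = tpl.Execute(os.Stdout, map[string]string{
        "RoleName": "user",
        "Content":  "Hello!",
    })
}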
func guessGGUFFromFile(cfg *BackendConfig, f *gguf.GGUFFile, defaultCtx int) {
|
||||
|
||||
if defaultCtx == 0 && cfg.ContextSize == nil {
|
||||
ctxSize := f.EstimateLLaMACppUsage().ContextSize
|
||||
if ctxSize > 0 {
|
||||
cSize := int(ctxSize)
|
||||
cfg.ContextSize = &cSize
|
||||
} else {
|
||||
defaultCtx = defaultContextSize
|
||||
cfg.ContextSize = &defaultCtx
|
||||
}
|
||||
}
|
||||
|
||||
if cfg.HasTemplate() {
|
||||
// nothing to guess here
|
||||
log.Debug().Any("name", cfg.Name).Msgf("guessDefaultsFromFile: %s", "template already set")
|
||||
return
|
||||
}
|
||||
|
||||
log.Debug().
|
||||
Any("eosTokenID", f.Tokenizer().EOSTokenID).
|
||||
Any("bosTokenID", f.Tokenizer().BOSTokenID).
|
||||
Any("modelName", f.Model().Name).
|
||||
Any("architecture", f.Architecture().Architecture).Msgf("Model file loaded: %s", cfg.ModelFileName())
|
||||
|
||||
// guess the name
|
||||
if cfg.Name == "" {
|
||||
cfg.Name = f.Model().Name
|
||||
}
|
||||
|
||||
family := identifyFamily(f)
|
||||
|
||||
if family == Unknown {
|
||||
log.Debug().Msgf("guessDefaultsFromFile: %s", "family not identified")
|
||||
return
|
||||
}
|
||||
|
||||
// identify template
|
||||
settings, ok := defaultsSettings[family]
|
||||
if ok {
|
||||
cfg.TemplateConfig = settings.TemplateConfig
|
||||
log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: guessed template %+v", cfg.TemplateConfig)
|
||||
if len(cfg.StopWords) == 0 {
|
||||
cfg.StopWords = settings.StopWords
|
||||
}
|
||||
if cfg.RepeatPenalty == 0.0 {
|
||||
cfg.RepeatPenalty = settings.RepeatPenalty
|
||||
}
|
||||
} else {
|
||||
log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: no template found for family")
|
||||
}
|
||||
|
||||
if cfg.HasTemplate() {
|
||||
return
|
||||
}
|
||||
|
||||
// identify from well known templates first, otherwise use the raw jinja template
|
||||
chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
|
||||
if found {
|
||||
// try to use the jinja template
|
||||
cfg.TemplateConfig.JinjaTemplate = true
|
||||
cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString()
|
||||
}
|
||||
}
|
||||
|
||||
func identifyFamily(f *gguf.GGUFFile) familyType {
|
||||
|
||||
// identify from well known templates first
|
||||
chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
|
||||
if found && chatTemplate.ValueString() != "" {
|
||||
if family, ok := knownTemplates[chatTemplate.ValueString()]; ok {
|
||||
return family
|
||||
}
|
||||
}
|
||||
|
||||
// otherwise try to identify from the model properties
|
||||
arch := f.Architecture().Architecture
|
||||
eosTokenID := f.Tokenizer().EOSTokenID
|
||||
bosTokenID := f.Tokenizer().BOSTokenID
|
||||
|
||||
isYI := arch == "llama" && bosTokenID == 1 && eosTokenID == 2
|
||||
// Note: Mistral 0.3 and Yi share the same bosTokenID and eosTokenID
|
||||
|
||||
llama3 := arch == "llama" && eosTokenID == 128009
|
||||
commandR := arch == "command-r" && eosTokenID == 255001
|
||||
qwen2 := arch == "qwen2"
|
||||
phi3 := arch == "phi-3"
|
||||
gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Model().Name), "gemma")
|
||||
deepseek2 := arch == "deepseek2"
|
||||
|
||||
switch {
|
||||
case deepseek2:
|
||||
return DeepSeek2
|
||||
case gemma:
|
||||
return Gemma
|
||||
case llama3:
|
||||
return LLaMa3
|
||||
case commandR:
|
||||
return CommandR
|
||||
case phi3:
|
||||
return Phi3
|
||||
case qwen2, isYI:
|
||||
return ChatML
|
||||
default:
|
||||
return Unknown
|
||||
}
|
||||
}
|
||||
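A standalone sketch of exercising the gguf-parser-go calls used above against a local file; the model path is a placeholder and error handling is reduced to the "not a GGUF file" case:

package main

import (
    "fmt"

    gguf "github.com/thxcode/gguf-parser-go"
)

func main() {
    // "./models/model.gguf" is a placeholder path.
    f, err := gguf.ParseGGUFFile("./models/model.gguf")
    if err != nil {
        fmt.Println("not a GGUF file:", err)
        return
    }

    fmt.Println("model name:  ", f.Model().Name)
    fmt.Println("architecture:", f.Architecture().Architecture)
    fmt.Println("eos token id:", f.Tokenizer().EOSTokenID)

    if tpl, found := f.Header.MetadataKV.Get("tokenizer.chat_template"); found {
        fmt.Println("embedded chat template is", len(tpl.ValueString()), "characters long")
    }
}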
@@ -3,147 +3,12 @@ package config
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/rs/zerolog/log"
|
||||
|
||||
gguf "github.com/thxcode/gguf-parser-go"
|
||||
)
|
||||
|
||||
type familyType uint8
|
||||
|
||||
const (
|
||||
Unknown familyType = iota
|
||||
LLaMa3
|
||||
CommandR
|
||||
Phi3
|
||||
ChatML
|
||||
Mistral03
|
||||
Gemma
|
||||
DeepSeek2
|
||||
)
|
||||
|
||||
type settingsConfig struct {
|
||||
StopWords []string
|
||||
TemplateConfig TemplateConfig
|
||||
RepeatPenalty float64
|
||||
}
|
||||
|
||||
// default settings to adopt with a given model family
|
||||
var defaultsSettings map[familyType]settingsConfig = map[familyType]settingsConfig{
|
||||
Gemma: {
|
||||
RepeatPenalty: 1.0,
|
||||
StopWords: []string{"<|im_end|>", "<end_of_turn>", "<start_of_turn>"},
|
||||
TemplateConfig: TemplateConfig{
|
||||
Chat: "{{.Input }}\n<start_of_turn>model\n",
|
||||
ChatMessage: "<start_of_turn>{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}<end_of_turn>",
|
||||
Completion: "{{.Input}}",
|
||||
},
|
||||
},
|
||||
DeepSeek2: {
|
||||
StopWords: []string{"<|end▁of▁sentence|>"},
|
||||
TemplateConfig: TemplateConfig{
|
||||
ChatMessage: `{{if eq .RoleName "user" -}}User: {{.Content }}
|
||||
{{ end -}}
|
||||
{{if eq .RoleName "assistant" -}}Assistant: {{.Content}}<|end▁of▁sentence|>{{end}}
|
||||
{{if eq .RoleName "system" -}}{{.Content}}
|
||||
{{end -}}`,
|
||||
Chat: "{{.Input -}}\nAssistant: ",
|
||||
},
|
||||
},
|
||||
LLaMa3: {
|
||||
StopWords: []string{"<|eot_id|>"},
|
||||
TemplateConfig: TemplateConfig{
|
||||
Chat: "<|begin_of_text|>{{.Input }}\n<|start_header_id|>assistant<|end_header_id|>",
|
||||
ChatMessage: "<|start_header_id|>{{ .RoleName }}<|end_header_id|>\n\n{{.Content }}<|eot_id|>",
|
||||
},
|
||||
},
|
||||
CommandR: {
|
||||
TemplateConfig: TemplateConfig{
|
||||
Chat: "{{.Input -}}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
|
||||
Functions: `<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
|
||||
You are a function calling AI model, you can call the following functions:
|
||||
## Available Tools
|
||||
{{range .Functions}}
|
||||
- {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
|
||||
{{end}}
|
||||
When using a tool, reply with JSON, for instance {"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}}
|
||||
<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Input -}}`,
|
||||
ChatMessage: `{{if eq .RoleName "user" -}}
|
||||
<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
|
||||
{{- else if eq .RoleName "system" -}}
|
||||
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
|
||||
{{- else if eq .RoleName "assistant" -}}
|
||||
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
|
||||
{{- else if eq .RoleName "tool" -}}
|
||||
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
|
||||
{{- else if .FunctionCall -}}
|
||||
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{toJson .FunctionCall}}}<|END_OF_TURN_TOKEN|>
|
||||
{{- end -}}`,
|
||||
},
|
||||
StopWords: []string{"<|END_OF_TURN_TOKEN|>"},
|
||||
},
|
||||
Phi3: {
|
||||
TemplateConfig: TemplateConfig{
|
||||
Chat: "{{.Input}}\n<|assistant|>",
|
||||
ChatMessage: "<|{{ .RoleName }}|>\n{{.Content}}<|end|>",
|
||||
Completion: "{{.Input}}",
|
||||
},
|
||||
StopWords: []string{"<|end|>", "<|endoftext|>"},
|
||||
},
|
||||
ChatML: {
|
||||
TemplateConfig: TemplateConfig{
|
||||
Chat: "{{.Input -}}\n<|im_start|>assistant",
|
||||
Functions: `<|im_start|>system
|
||||
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
|
||||
{{range .Functions}}
|
||||
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
|
||||
{{end}}
|
||||
For each function call return a json object with function name and arguments
|
||||
<|im_end|>
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant`,
|
||||
ChatMessage: `<|im_start|>{{ .RoleName }}
|
||||
{{ if .FunctionCall -}}
|
||||
Function call:
|
||||
{{ else if eq .RoleName "tool" -}}
|
||||
Function response:
|
||||
{{ end -}}
|
||||
{{ if .Content -}}
|
||||
{{.Content }}
|
||||
{{ end -}}
|
||||
{{ if .FunctionCall -}}
|
||||
{{toJson .FunctionCall}}
|
||||
{{ end -}}<|im_end|>`,
|
||||
},
|
||||
StopWords: []string{"<|im_end|>", "<dummy32000>", "</s>"},
|
||||
},
|
||||
Mistral03: {
|
||||
TemplateConfig: TemplateConfig{
|
||||
Chat: "{{.Input -}}",
|
||||
Functions: `[AVAILABLE_TOOLS] [{{range .Functions}}{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}{{end}} ] [/AVAILABLE_TOOLS]{{.Input }}`,
|
||||
ChatMessage: `{{if eq .RoleName "user" -}}
|
||||
[INST] {{.Content }} [/INST]
|
||||
{{- else if .FunctionCall -}}
|
||||
[TOOL_CALLS] {{toJson .FunctionCall}} [/TOOL_CALLS]
|
||||
{{- else if eq .RoleName "tool" -}}
|
||||
[TOOL_RESULTS] {{.Content}} [/TOOL_RESULTS]
|
||||
{{- else -}}
|
||||
{{ .Content -}}
|
||||
{{ end -}}`,
|
||||
},
|
||||
StopWords: []string{"<|im_end|>", "<dummy32000>", "</tool_call>", "<|eot_id|>", "<|end_of_text|>", "</s>", "[/TOOL_CALLS]", "[/ACTIONS]"},
|
||||
},
|
||||
}
|
||||
|
||||
// this maps well known template used in HF to model families defined above
|
||||
var knownTemplates = map[string]familyType{
|
||||
`{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}`: ChatML,
|
||||
`{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}`: Mistral03,
|
||||
}
|
||||
|
||||
func guessDefaultsFromFile(cfg *BackendConfig, modelPath string) {
|
||||
|
||||
func guessDefaultsFromFile(cfg *BackendConfig, modelPath string, defaultCtx int) {
|
||||
if os.Getenv("LOCALAI_DISABLE_GUESSING") == "true" {
|
||||
log.Debug().Msgf("guessDefaultsFromFile: %s", "guessing disabled with LOCALAI_DISABLE_GUESSING")
|
||||
return
|
||||
@@ -154,106 +19,20 @@ func guessDefaultsFromFile(cfg *BackendConfig, modelPath string) {
|
||||
return
|
||||
}
|
||||
|
||||
if cfg.HasTemplate() {
|
||||
// nothing to guess here
|
||||
log.Debug().Any("name", cfg.Name).Msgf("guessDefaultsFromFile: %s", "template already set")
|
||||
return
|
||||
}
|
||||
|
||||
// We try to guess only if we don't have a template defined already
|
||||
guessPath := filepath.Join(modelPath, cfg.ModelFileName())
|
||||
|
||||
// try to parse the gguf file
|
||||
f, err := gguf.ParseGGUFFile(guessPath)
|
||||
if err != nil {
|
||||
// Only valid for gguf files
|
||||
log.Debug().Str("filePath", guessPath).Msg("guessDefaultsFromFile: not a GGUF file")
|
||||
if err == nil {
|
||||
guessGGUFFromFile(cfg, f, defaultCtx)
|
||||
return
|
||||
}
|
||||
|
||||
log.Debug().
|
||||
Any("eosTokenID", f.Tokenizer().EOSTokenID).
|
||||
Any("bosTokenID", f.Tokenizer().BOSTokenID).
|
||||
Any("modelName", f.Model().Name).
|
||||
Any("architecture", f.Architecture().Architecture).Msgf("Model file loaded: %s", cfg.ModelFileName())
|
||||
|
||||
// guess the name
|
||||
if cfg.Name == "" {
|
||||
cfg.Name = f.Model().Name
|
||||
}
|
||||
|
||||
family := identifyFamily(f)
|
||||
|
||||
if family == Unknown {
|
||||
log.Debug().Msgf("guessDefaultsFromFile: %s", "family not identified")
|
||||
return
|
||||
}
|
||||
|
||||
// identify template
|
||||
settings, ok := defaultsSettings[family]
|
||||
if ok {
|
||||
cfg.TemplateConfig = settings.TemplateConfig
|
||||
log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: guessed template %+v", cfg.TemplateConfig)
|
||||
if len(cfg.StopWords) == 0 {
|
||||
cfg.StopWords = settings.StopWords
|
||||
if cfg.ContextSize == nil {
|
||||
if defaultCtx == 0 {
|
||||
defaultCtx = defaultContextSize
|
||||
}
|
||||
if cfg.RepeatPenalty == 0.0 {
|
||||
cfg.RepeatPenalty = settings.RepeatPenalty
|
||||
}
|
||||
} else {
|
||||
log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: no template found for family")
|
||||
}
|
||||
|
||||
if cfg.HasTemplate() {
|
||||
return
|
||||
}
|
||||
|
||||
// identify from well known templates first, otherwise use the raw jinja template
|
||||
chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
|
||||
if found {
|
||||
// try to use the jinja template
|
||||
cfg.TemplateConfig.JinjaTemplate = true
|
||||
cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString()
|
||||
}
|
||||
}
|
||||
|
||||
func identifyFamily(f *gguf.GGUFFile) familyType {
|
||||
|
||||
// identify from well known templates first
|
||||
chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
|
||||
if found && chatTemplate.ValueString() != "" {
|
||||
if family, ok := knownTemplates[chatTemplate.ValueString()]; ok {
|
||||
return family
|
||||
}
|
||||
}
|
||||
|
||||
// otherwise try to identify from the model properties
|
||||
arch := f.Architecture().Architecture
|
||||
eosTokenID := f.Tokenizer().EOSTokenID
|
||||
bosTokenID := f.Tokenizer().BOSTokenID
|
||||
|
||||
isYI := arch == "llama" && bosTokenID == 1 && eosTokenID == 2
|
||||
// WTF! Mistral0.3 and isYi have same bosTokenID and eosTokenID
|
||||
|
||||
llama3 := arch == "llama" && eosTokenID == 128009
|
||||
commandR := arch == "command-r" && eosTokenID == 255001
|
||||
qwen2 := arch == "qwen2"
|
||||
phi3 := arch == "phi-3"
|
||||
gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Model().Name), "gemma")
|
||||
deepseek2 := arch == "deepseek2"
|
||||
|
||||
switch {
|
||||
case deepseek2:
|
||||
return DeepSeek2
|
||||
case gemma:
|
||||
return Gemma
|
||||
case llama3:
|
||||
return LLaMa3
|
||||
case commandR:
|
||||
return CommandR
|
||||
case phi3:
|
||||
return Phi3
|
||||
case qwen2, isYI:
|
||||
return ChatML
|
||||
default:
|
||||
return Unknown
|
||||
cfg.ContextSize = &defaultCtx
|
||||
}
|
||||
}
|
||||
|
||||
@@ -29,6 +29,8 @@ func InstallModelFromGallery(galleries []config.Gallery, name string, basePath s
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
config.Description = model.Description
|
||||
config.License = model.License
|
||||
} else if len(model.ConfigFile) > 0 {
|
||||
// TODO: is this worse than using the override method with a blank cfg yaml?
|
||||
reYamlConfig, err := yaml.Marshal(model.ConfigFile)
|
||||
@@ -114,7 +116,7 @@ func FindModel(models []*GalleryModel, name string, basePath string) *GalleryMod
|
||||
// List available models
// Model galleries are lists of YAML files hosted on a remote server (for example GitHub).
// Each YAML file contains a list of models that can be downloaded, and optional overrides that define new model settings.
|
||||
func AvailableGalleryModels(galleries []config.Gallery, basePath string) ([]*GalleryModel, error) {
|
||||
func AvailableGalleryModels(galleries []config.Gallery, basePath string) (GalleryModels, error) {
|
||||
var models []*GalleryModel
|
||||
|
||||
// Get models from galleries
|
||||
|
||||
@@ -62,3 +62,15 @@ func (gm GalleryModels) FindByName(name string) *GalleryModel {
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (gm GalleryModels) Paginate(pageNum int, itemsNum int) GalleryModels {
|
||||
start := (pageNum - 1) * itemsNum
|
||||
end := start + itemsNum
|
||||
if start > len(gm) {
|
||||
start = len(gm)
|
||||
}
|
||||
if end > len(gm) {
|
||||
end = len(gm)
|
||||
}
|
||||
return gm[start:end]
|
||||
}
|
||||
|
||||
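A generic restatement of the clamped-slice pagination above, useful as a quick check of the edge cases; paginate here is a hypothetical helper, and pageNum is assumed to be >= 1 as the HTTP handlers enforce:

package main

import "fmt"

// paginate mirrors GalleryModels.Paginate above for any slice type.
func paginate[T any](items []T, pageNum, itemsNum int) []T {
    start := (pageNum - 1) * itemsNum
    end := start + itemsNum
    if start > len(items) {
        start = len(items)
    }
    if end > len(items) {
        end = len(items)
    }
    return items[start:end]
}

func main() {
    xs := []int{1, 2, 3, 4, 5}
    fmt.Println(paginate(xs, 2, 2)) // [3 4]
    fmt.Println(paginate(xs, 9, 2)) // []
}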
@@ -139,6 +139,28 @@ func API(application *application.Application) (*fiber.App, error) {
|
||||
return nil, fmt.Errorf("failed to create key auth config: %w", err)
|
||||
}
|
||||
|
||||
httpFS := http.FS(embedDirStatic)
|
||||
|
||||
router.Use(favicon.New(favicon.Config{
|
||||
URL: "/favicon.svg",
|
||||
FileSystem: httpFS,
|
||||
File: "static/favicon.svg",
|
||||
}))
|
||||
|
||||
router.Use("/static", filesystem.New(filesystem.Config{
|
||||
Root: httpFS,
|
||||
PathPrefix: "static",
|
||||
Browse: true,
|
||||
}))
|
||||
|
||||
if application.ApplicationConfig().ImageDir != "" {
|
||||
router.Static("/generated-images", application.ApplicationConfig().ImageDir)
|
||||
}
|
||||
|
||||
if application.ApplicationConfig().AudioDir != "" {
|
||||
router.Static("/generated-audio", application.ApplicationConfig().AudioDir)
|
||||
}
|
||||
|
||||
// Auth is applied to _all_ endpoints. No exceptions. Filtering out endpoints to bypass is the role of the Filter property of the KeyAuth Configuration
|
||||
router.Use(v2keyauth.New(*kaConfig))
|
||||
|
||||
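A minimal standalone sketch of the favicon/static middleware wiring moved above; the Config fields mirror the diff, while the import paths follow the usual Fiber v2 middleware layout and the embedded static/ directory (containing favicon.svg) is an assumption of the example:

package main

import (
    "embed"
    "net/http"

    "github.com/gofiber/fiber/v2"
    "github.com/gofiber/fiber/v2/middleware/favicon"
    "github.com/gofiber/fiber/v2/middleware/filesystem"
)

// Assumes a local static/ directory containing favicon.svg.
//go:embed static/*
var embedDirStatic embed.FS

func main() {
    app := fiber.New()
    httpFS := http.FS(embedDirStatic)

    app.Use(favicon.New(favicon.Config{
        URL:        "/favicon.svg",
        FileSystem: httpFS,
        File:       "static/favicon.svg",
    }))

    app.Use("/static", filesystem.New(filesystem.Config{
        Root:       httpFS,
        PathPrefix: "static",
        Browse:     true,
    }))

    _ = app.Listen(":8080")
}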
@@ -176,20 +198,6 @@ func API(application *application.Application) (*fiber.App, error) {
|
||||
}
|
||||
routes.RegisterJINARoutes(router, requestExtractor, application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())
|
||||
|
||||
httpFS := http.FS(embedDirStatic)
|
||||
|
||||
router.Use(favicon.New(favicon.Config{
|
||||
URL: "/favicon.ico",
|
||||
FileSystem: httpFS,
|
||||
File: "static/favicon.ico",
|
||||
}))
|
||||
|
||||
router.Use("/static", filesystem.New(filesystem.Config{
|
||||
Root: httpFS,
|
||||
PathPrefix: "static",
|
||||
Browse: true,
|
||||
}))
|
||||
|
||||
// Define a custom 404 handler
|
||||
// Note: keep this at the bottom!
|
||||
router.Use(notFoundHandler)
|
||||
|
||||
@@ -13,7 +13,7 @@ func installButton(galleryName string) elem.Node {
|
||||
attrs.Props{
|
||||
"data-twe-ripple-init": "",
|
||||
"data-twe-ripple-color": "light",
|
||||
"class": "float-right inline-block rounded bg-primary px-6 pb-2.5 mb-3 pt-2.5 text-xs font-medium uppercase leading-normal text-white shadow-primary-3 transition duration-150 ease-in-out hover:bg-primary-accent-300 hover:shadow-primary-2 focus:bg-primary-accent-300 focus:shadow-primary-2 focus:outline-none focus:ring-0 active:bg-primary-600 active:shadow-primary-2 dark:shadow-black/30 dark:hover:shadow-dark-strong dark:focus:shadow-dark-strong dark:active:shadow-dark-strong",
|
||||
"class": "float-right inline-flex items-center rounded-lg bg-blue-600 hover:bg-blue-700 px-4 py-2 text-sm font-medium text-white transition duration-300 ease-in-out shadow hover:shadow-lg",
|
||||
"hx-swap": "outerHTML",
|
||||
// post the Model ID as param
|
||||
"hx-post": "browse/install/model/" + galleryName,
|
||||
@@ -52,7 +52,7 @@ func infoButton(m *gallery.GalleryModel) elem.Node {
|
||||
attrs.Props{
|
||||
"data-twe-ripple-init": "",
|
||||
"data-twe-ripple-color": "light",
|
||||
"class": "float-left inline-block rounded bg-primary px-6 pb-2.5 mb-3 pt-2.5 text-xs font-medium uppercase leading-normal text-white shadow-primary-3 transition duration-150 ease-in-out hover:bg-primary-accent-300 hover:shadow-primary-2 focus:bg-primary-accent-300 focus:shadow-primary-2 focus:outline-none focus:ring-0 active:bg-primary-600 active:shadow-primary-2 dark:shadow-black/30 dark:hover:shadow-dark-strong dark:focus:shadow-dark-strong dark:active:shadow-dark-strong",
|
||||
"class": "inline-flex items-center rounded-lg bg-gray-700 hover:bg-gray-600 px-4 py-2 text-sm font-medium text-white transition duration-300 ease-in-out",
|
||||
"data-modal-target": modalName(m),
|
||||
"data-modal-toggle": modalName(m),
|
||||
},
|
||||
|
||||
@@ -17,7 +17,7 @@ const (
|
||||
func cardSpan(text, icon string) elem.Node {
|
||||
return elem.Span(
|
||||
attrs.Props{
|
||||
"class": "inline-block bg-gray-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2",
|
||||
"class": "inline-flex items-center px-3 py-1 rounded-lg text-xs font-medium bg-gray-700/70 text-gray-300 border border-gray-600/50 mr-2 mb-2",
|
||||
},
|
||||
elem.I(attrs.Props{
|
||||
"class": icon + " pr-2",
|
||||
@@ -39,19 +39,20 @@ func searchableElement(text, icon string) elem.Node {
|
||||
),
|
||||
elem.Span(
|
||||
attrs.Props{
|
||||
"class": "inline-block bg-gray-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2 hover:bg-gray-300 hover:shadow-gray-2",
|
||||
"class": "inline-flex items-center text-xs px-3 py-1 rounded-full bg-gray-700/60 text-gray-300 border border-gray-600/50 hover:bg-gray-600 hover:text-gray-100 transition duration-200 ease-in-out",
|
||||
},
|
||||
elem.A(
|
||||
attrs.Props{
|
||||
// "name": "search",
|
||||
// "value": text,
|
||||
//"class": "inline-block bg-gray-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2",
|
||||
"href": "#!",
|
||||
"hx-post": "browse/search/models",
|
||||
"hx-target": "#search-results",
|
||||
//"href": "#!",
|
||||
"href": "browse?term=" + text,
|
||||
//"hx-post": "browse/search/models",
|
||||
//"hx-target": "#search-results",
|
||||
// TODO: this doesn't work
|
||||
// "hx-vals": `{ \"search\": \"` + text + `\" }`,
|
||||
"hx-indicator": ".htmx-indicator",
|
||||
//"hx-indicator": ".htmx-indicator",
|
||||
},
|
||||
elem.I(attrs.Props{
|
||||
"class": icon + " pr-2",
|
||||
@@ -101,7 +102,7 @@ func modalName(m *gallery.GalleryModel) string {
|
||||
return m.Name + "-modal"
|
||||
}
|
||||
|
||||
func modelDescription(m *gallery.GalleryModel) elem.Node {
|
||||
func modelModal(m *gallery.GalleryModel) elem.Node {
|
||||
urls := []elem.Node{}
|
||||
for _, url := range m.URLs {
|
||||
urls = append(urls,
|
||||
@@ -116,6 +117,123 @@ func modelDescription(m *gallery.GalleryModel) elem.Node {
|
||||
)
|
||||
}
|
||||
|
||||
return elem.Div(
|
||||
attrs.Props{
|
||||
"id": modalName(m),
|
||||
"tabindex": "-1",
|
||||
"aria-hidden": "true",
|
||||
"class": "hidden fixed top-0 right-0 left-0 z-50 justify-center items-center w-full md:inset-0 h-full max-h-full bg-gray-900/50",
|
||||
},
|
||||
elem.Div(
|
||||
attrs.Props{
|
||||
"class": "relative p-4 w-full max-w-2xl h-[90vh] mx-auto mt-[5vh]",
|
||||
},
|
||||
elem.Div(
|
||||
attrs.Props{
|
||||
"class": "relative bg-white rounded-lg shadow dark:bg-gray-700 h-full flex flex-col",
|
||||
},
|
||||
// header
|
||||
elem.Div(
|
||||
attrs.Props{
|
||||
"class": "flex items-center justify-between p-4 md:p-5 border-b rounded-t dark:border-gray-600",
|
||||
},
|
||||
elem.H3(
|
||||
attrs.Props{
|
||||
"class": "text-xl font-semibold text-gray-900 dark:text-white",
|
||||
},
|
||||
elem.Text(bluemonday.StrictPolicy().Sanitize(m.Name)),
|
||||
),
|
||||
elem.Button( // close button
|
||||
attrs.Props{
|
||||
"class": "text-gray-400 bg-transparent hover:bg-gray-200 hover:text-gray-900 rounded-lg text-sm w-8 h-8 ms-auto inline-flex justify-center items-center dark:hover:bg-gray-600 dark:hover:text-white",
|
||||
"data-modal-hide": modalName(m),
|
||||
},
|
||||
elem.Raw(
|
||||
`<svg class="w-3 h-3" aria-hidden="true" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 14 14">
|
||||
<path stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="m1 1 6 6m0 0 6 6M7 7l6-6M7 7l-6 6"/>
|
||||
</svg>`,
|
||||
),
|
||||
elem.Span(
|
||||
attrs.Props{
|
||||
"class": "sr-only",
|
||||
},
|
||||
elem.Text("Close modal"),
|
||||
),
|
||||
),
|
||||
),
|
||||
// body
|
||||
elem.Div(
|
||||
attrs.Props{
|
||||
"class": "p-4 md:p-5 space-y-4 overflow-y-auto flex-1 min-h-0",
|
||||
},
|
||||
elem.Div(
|
||||
attrs.Props{
|
||||
"class": "flex justify-center items-center",
|
||||
},
|
||||
elem.Img(attrs.Props{
|
||||
"class": "lazy rounded-t-lg max-h-48 max-w-96 object-cover mt-3 entered loaded",
|
||||
"src": m.Icon,
|
||||
"loading": "lazy",
|
||||
}),
|
||||
),
|
||||
elem.P(
|
||||
attrs.Props{
|
||||
"class": "text-base leading-relaxed text-gray-500 dark:text-gray-400",
|
||||
},
|
||||
elem.Text(bluemonday.StrictPolicy().Sanitize(m.Description)),
|
||||
),
|
||||
elem.Hr(
|
||||
attrs.Props{},
|
||||
),
|
||||
elem.P(
|
||||
attrs.Props{
|
||||
"class": "text-sm font-semibold text-gray-900 dark:text-white",
|
||||
},
|
||||
elem.Text("Links"),
|
||||
),
|
||||
elem.Ul(
|
||||
attrs.Props{},
|
||||
urls...,
|
||||
),
|
||||
elem.If(
|
||||
len(m.Tags) > 0,
|
||||
elem.Div(
|
||||
attrs.Props{},
|
||||
elem.P(
|
||||
attrs.Props{
|
||||
"class": "text-sm mb-5 font-semibold text-gray-900 dark:text-white",
|
||||
},
|
||||
elem.Text("Tags"),
|
||||
),
|
||||
elem.Div(
|
||||
attrs.Props{
|
||||
"class": "flex flex-row flex-wrap content-center",
|
||||
},
|
||||
tagsNodes...,
|
||||
),
|
||||
),
|
||||
elem.Div(attrs.Props{}),
|
||||
),
|
||||
),
|
||||
// Footer
|
||||
elem.Div(
|
||||
attrs.Props{
|
||||
"class": "flex items-center p-4 md:p-5 border-t border-gray-200 rounded-b dark:border-gray-600",
|
||||
},
|
||||
elem.Button(
|
||||
attrs.Props{
|
||||
"data-modal-hide": modalName(m),
|
||||
"class": "py-2.5 px-5 ms-3 text-sm font-medium text-gray-900 focus:outline-none bg-white rounded-lg border border-gray-200 hover:bg-gray-100 hover:text-blue-700 focus:z-10 focus:ring-4 focus:ring-gray-100 dark:focus:ring-gray-700 dark:bg-gray-800 dark:text-gray-400 dark:border-gray-600 dark:hover:text-white dark:hover:bg-gray-700",
|
||||
},
|
||||
elem.Text("Close"),
|
||||
),
|
||||
),
|
||||
),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
func modelDescription(m *gallery.GalleryModel) elem.Node {
|
||||
return elem.Div(
|
||||
attrs.Props{
|
||||
"class": "p-6 text-surface dark:text-white",
|
||||
@@ -132,122 +250,6 @@ func modelDescription(m *gallery.GalleryModel) elem.Node {
|
||||
},
|
||||
elem.Text(bluemonday.StrictPolicy().Sanitize(m.Description)),
|
||||
),
|
||||
|
||||
elem.Div(
|
||||
attrs.Props{
|
||||
"id": modalName(m),
|
||||
"tabindex": "-1",
|
||||
"aria-hidden": "true",
|
||||
"class": "hidden overflow-y-auto overflow-x-hidden fixed top-0 right-0 left-0 z-50 justify-center items-center w-full md:inset-0 h-[calc(100%-1rem)] max-h-full",
|
||||
},
|
||||
elem.Div(
|
||||
attrs.Props{
|
||||
"class": "relative p-4 w-full max-w-2xl max-h-full",
|
||||
},
|
||||
elem.Div(
|
||||
attrs.Props{
|
||||
"class": "relative p-4 w-full max-w-2xl max-h-full bg-white rounded-lg shadow dark:bg-gray-700",
|
||||
},
|
||||
// header
|
||||
elem.Div(
|
||||
attrs.Props{
|
||||
"class": "flex items-center justify-between p-4 md:p-5 border-b rounded-t dark:border-gray-600",
|
||||
},
|
||||
elem.H3(
|
||||
attrs.Props{
|
||||
"class": "text-xl font-semibold text-gray-900 dark:text-white",
|
||||
},
|
||||
elem.Text(bluemonday.StrictPolicy().Sanitize(m.Name)),
|
||||
),
|
||||
elem.Button( // close button
|
||||
attrs.Props{
|
||||
"class": "text-gray-400 bg-transparent hover:bg-gray-200 hover:text-gray-900 rounded-lg text-sm w-8 h-8 ms-auto inline-flex justify-center items-center dark:hover:bg-gray-600 dark:hover:text-white",
|
||||
"data-modal-hide": modalName(m),
|
||||
},
|
||||
elem.Raw(
|
||||
`<svg class="w-3 h-3" aria-hidden="true" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 14 14">
|
||||
<path stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="m1 1 6 6m0 0 6 6M7 7l6-6M7 7l-6 6"/>
|
||||
</svg>`,
|
||||
),
|
||||
elem.Span(
|
||||
attrs.Props{
|
||||
"class": "sr-only",
|
||||
},
|
||||
elem.Text("Close modal"),
|
||||
),
|
||||
),
|
||||
),
|
||||
// body
|
||||
elem.Div(
|
||||
attrs.Props{
|
||||
"class": "p-4 md:p-5 space-y-4",
|
||||
},
|
||||
elem.Div(
|
||||
attrs.Props{
|
||||
"class": "flex justify-center items-center",
|
||||
},
|
||||
elem.Img(attrs.Props{
|
||||
// "class": "rounded-t-lg object-fit object-center h-96",
|
||||
"class": "lazy rounded-t-lg max-h-48 max-w-96 object-cover mt-3 entered loaded",
|
||||
"src": m.Icon,
|
||||
"loading": "lazy",
|
||||
}),
|
||||
),
|
||||
elem.P(
|
||||
attrs.Props{
|
||||
"class": "text-base leading-relaxed text-gray-500 dark:text-gray-400",
|
||||
},
|
||||
elem.Text(bluemonday.StrictPolicy().Sanitize(m.Description)),
|
||||
),
|
||||
elem.Hr(
|
||||
attrs.Props{},
|
||||
),
|
||||
elem.P(
|
||||
attrs.Props{
|
||||
"class": "text-sm font-semibold text-gray-900 dark:text-white",
|
||||
},
|
||||
elem.Text("Links"),
|
||||
),
|
||||
elem.Ul(
|
||||
attrs.Props{},
|
||||
urls...,
|
||||
),
|
||||
elem.If(
|
||||
len(m.Tags) > 0,
|
||||
elem.Div(
|
||||
attrs.Props{},
|
||||
elem.P(
|
||||
attrs.Props{
|
||||
"class": "text-sm mb-5 font-semibold text-gray-900 dark:text-white",
|
||||
},
|
||||
elem.Text("Tags"),
|
||||
),
|
||||
elem.Div(
|
||||
attrs.Props{
|
||||
"class": "flex flex-row flex-wrap content-center",
|
||||
},
|
||||
tagsNodes...,
|
||||
),
|
||||
),
|
||||
elem.Div(attrs.Props{}),
|
||||
),
|
||||
),
|
||||
// Footer
|
||||
elem.Div(
|
||||
attrs.Props{
|
||||
"class": "flex items-center p-4 md:p-5 border-t border-gray-200 rounded-b dark:border-gray-600",
|
||||
},
|
||||
elem.Button(
|
||||
attrs.Props{
|
||||
"data-modal-hide": modalName(m),
|
||||
"class": "py-2.5 px-5 ms-3 text-sm font-medium text-gray-900 focus:outline-none bg-white rounded-lg border border-gray-200 hover:bg-gray-100 hover:text-blue-700 focus:z-10 focus:ring-4 focus:ring-gray-100 dark:focus:ring-gray-700 dark:bg-gray-800 dark:text-gray-400 dark:border-gray-600 dark:hover:text-white dark:hover:bg-gray-700",
|
||||
},
|
||||
elem.Text("Close"),
|
||||
),
|
||||
),
|
||||
),
|
||||
),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
@@ -397,7 +399,7 @@ func ListModels(models []*gallery.GalleryModel, processTracker ProcessTracker, g
|
||||
modelsElements = append(modelsElements,
|
||||
elem.Div(
|
||||
attrs.Props{
|
||||
"class": " me-4 mb-2 block rounded-lg bg-white shadow-secondary-1 dark:bg-gray-800 dark:bg-surface-dark dark:text-white text-surface pb-2",
|
||||
"class": " me-4 mb-2 block rounded-lg bg-white shadow-secondary-1 dark:bg-gray-800 dark:bg-surface-dark dark:text-white text-surface pb-2 bg-gray-800/90 border border-gray-700/50 rounded-xl overflow-hidden transition-all duration-300 hover:shadow-lg hover:shadow-blue-900/20 hover:-translate-y-1 hover:border-blue-700/50",
|
||||
},
|
||||
elem.Div(
|
||||
attrs.Props{
|
||||
@@ -406,6 +408,7 @@ func ListModels(models []*gallery.GalleryModel, processTracker ProcessTracker, g
|
||||
elems...,
|
||||
),
|
||||
),
|
||||
modelModal(m),
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@ package elements
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/chasefleming/elem-go"
|
||||
"github.com/chasefleming/elem-go/attrs"
|
||||
@@ -18,19 +19,6 @@ func renderElements(n []elem.Node) string {
|
||||
}
|
||||
|
||||
func P2PNodeStats(nodes []p2p.NodeData) string {
|
||||
/*
|
||||
<div class="bg-gray-800 p-6 rounded-lg shadow-lg text-left">
|
||||
<p class="text-xl font-semibold text-gray-200">Total Workers Detected: {{ len .Nodes }}</p>
|
||||
{{ $online := 0 }}
|
||||
{{ range .Nodes }}
|
||||
{{ if .IsOnline }}
|
||||
{{ $online = add $online 1 }}
|
||||
{{ end }}
|
||||
{{ end }}
|
||||
<p class="text-xl font-semibold text-gray-200">Total Online Workers: {{$online}}</p>
|
||||
</div>
|
||||
*/
|
||||
|
||||
online := 0
|
||||
for _, n := range nodes {
|
||||
if n.IsOnline() {
|
||||
@@ -38,27 +26,21 @@ func P2PNodeStats(nodes []p2p.NodeData) string {
|
||||
}
|
||||
}
|
||||
|
||||
class := "text-green-500"
|
||||
class := "text-blue-400"
|
||||
if online == 0 {
|
||||
class = "text-red-500"
|
||||
class = "text-red-400"
|
||||
}
|
||||
/*
|
||||
<i class="fas fa-circle animate-pulse text-green-500 ml-2 mr-1"></i>
|
||||
*/
|
||||
circle := elem.I(attrs.Props{
|
||||
"class": "fas fa-circle animate-pulse " + class + " ml-2 mr-1",
|
||||
})
|
||||
|
||||
nodesElements := []elem.Node{
|
||||
elem.Span(
|
||||
attrs.Props{
|
||||
"class": class,
|
||||
"class": class + " font-bold text-xl",
|
||||
},
|
||||
circle,
|
||||
elem.Text(fmt.Sprintf("%d", online)),
|
||||
),
|
||||
elem.Span(
|
||||
attrs.Props{
|
||||
"class": "text-gray-200",
|
||||
"class": "text-gray-300 text-xl",
|
||||
},
|
||||
elem.Text(fmt.Sprintf("/%d", len(nodes))),
|
||||
),
|
||||
@@ -68,77 +50,73 @@ func P2PNodeStats(nodes []p2p.NodeData) string {
|
||||
}
|
||||
|
||||
func P2PNodeBoxes(nodes []p2p.NodeData) string {
|
||||
/*
|
||||
<div class="bg-gray-800 p-4 rounded-lg shadow-lg text-left">
|
||||
<div class="flex items-center mb-2">
|
||||
<i class="fas fa-desktop text-gray-400 mr-2"></i>
|
||||
<span class="text-gray-200 font-semibold">{{.ID}}</span>
|
||||
</div>
|
||||
<p class="text-sm text-gray-400 mt-2 flex items-center">
|
||||
Status:
|
||||
<i class="fas fa-circle {{ if .IsOnline }}text-green-500{{ else }}text-red-500{{ end }} ml-2 mr-1"></i>
|
||||
<span class="{{ if .IsOnline }}text-green-400{{ else }}text-red-400{{ end }}">
|
||||
{{ if .IsOnline }}Online{{ else }}Offline{{ end }}
|
||||
</span>
|
||||
</p>
|
||||
</div>
|
||||
*/
|
||||
|
||||
nodesElements := []elem.Node{}
|
||||
|
||||
for _, n := range nodes {
|
||||
nodeID := bluemonday.StrictPolicy().Sanitize(n.ID)
|
||||
|
||||
// Define status-specific classes
|
||||
statusIconClass := "text-green-400"
|
||||
statusText := "Online"
|
||||
statusTextClass := "text-green-400"
|
||||
|
||||
if !n.IsOnline() {
|
||||
statusIconClass = "text-red-400"
|
||||
statusText = "Offline"
|
||||
statusTextClass = "text-red-400"
|
||||
}
|
||||
|
||||
nodesElements = append(nodesElements,
|
||||
elem.Div(
|
||||
attrs.Props{
|
||||
"class": "bg-gray-700 p-6 rounded-lg shadow-lg text-left",
|
||||
"class": "bg-gray-800/80 border border-gray-700/50 rounded-xl p-4 shadow-lg transition-all duration-300 hover:shadow-blue-900/20 hover:border-blue-700/50",
|
||||
},
|
||||
elem.P(
|
||||
// Node ID and status indicator in top row
|
||||
elem.Div(
|
||||
attrs.Props{
|
||||
"class": "text-sm text-gray-400 mt-2 flex",
|
||||
"class": "flex items-center justify-between mb-3",
|
||||
},
|
||||
elem.I(
|
||||
// Node ID with icon
|
||||
elem.Div(
|
||||
attrs.Props{
|
||||
"class": "fas fa-desktop text-gray-400 mr-2",
|
||||
"class": "flex items-center",
|
||||
},
|
||||
),
|
||||
elem.Text("Name: "),
|
||||
elem.Span(
|
||||
attrs.Props{
|
||||
"class": "text-gray-200 font-semibold ml-2 mr-1",
|
||||
},
|
||||
elem.Text(bluemonday.StrictPolicy().Sanitize(n.ID)),
|
||||
),
|
||||
elem.Text("Status: "),
|
||||
elem.If(
|
||||
n.IsOnline(),
|
||||
elem.I(
|
||||
attrs.Props{
|
||||
"class": "fas fa-circle animate-pulse text-green-500 ml-2 mr-1",
|
||||
"class": "fas fa-server text-blue-400 mr-2",
|
||||
},
|
||||
),
|
||||
elem.I(
|
||||
attrs.Props{
|
||||
"class": "fas fa-circle animate-pulse text-red-500 ml-2 mr-1",
|
||||
},
|
||||
),
|
||||
),
|
||||
elem.If(
|
||||
n.IsOnline(),
|
||||
elem.Span(
|
||||
attrs.Props{
|
||||
"class": "text-green-400",
|
||||
},
|
||||
|
||||
elem.Text("Online"),
|
||||
),
|
||||
elem.Span(
|
||||
attrs.Props{
|
||||
"class": "text-red-400",
|
||||
"class": "text-white font-medium",
|
||||
},
|
||||
elem.Text("Offline"),
|
||||
elem.Text(nodeID),
|
||||
),
|
||||
),
|
||||
// Status indicator
|
||||
elem.Div(
|
||||
attrs.Props{
|
||||
"class": "flex items-center",
|
||||
},
|
||||
elem.I(
|
||||
attrs.Props{
|
||||
"class": "fas fa-circle animate-pulse " + statusIconClass + " mr-1.5",
|
||||
},
|
||||
),
|
||||
elem.Span(
|
||||
attrs.Props{
|
||||
"class": statusTextClass,
|
||||
},
|
||||
elem.Text(statusText),
|
||||
),
|
||||
),
|
||||
),
|
||||
// Bottom section with timestamp
|
||||
elem.Div(
|
||||
attrs.Props{
|
||||
"class": "text-xs text-gray-400 pt-1 border-t border-gray-700/30",
|
||||
},
|
||||
elem.Text("Last updated: "+time.Now().UTC().Format("2006-01-02 15:04:05")),
|
||||
),
|
||||
))
|
||||
}
|
||||
|
||||
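For reference, the elem-go construction pattern used throughout these helpers, reduced to a runnable sketch; the Render method is assumed from elem-go's public API, and the CSS classes are arbitrary:

package main

import (
    "fmt"

    "github.com/chasefleming/elem-go"
    "github.com/chasefleming/elem-go/attrs"
)

func main() {
    // Build a small status badge the same way P2PNodeBoxes composes nodes.
    node := elem.Div(
        attrs.Props{"class": "bg-gray-800 p-4 rounded-lg"},
        elem.I(attrs.Props{"class": "fas fa-circle text-green-400 mr-1.5"}),
        elem.Span(attrs.Props{"class": "text-green-400"}, elem.Text("Online")),
    )
    fmt.Println(node.Render())
}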
@@ -21,6 +21,7 @@ func StoresSetEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfi
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer sl.Close()
|
||||
|
||||
vals := make([][]byte, len(input.Values))
|
||||
for i, v := range input.Values {
|
||||
@@ -48,6 +49,7 @@ func StoresDeleteEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationCo
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer sl.Close()
|
||||
|
||||
if err := store.DeleteCols(c.Context(), sb, input.Keys); err != nil {
|
||||
return err
|
||||
@@ -69,6 +71,7 @@ func StoresGetEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfi
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer sl.Close()
|
||||
|
||||
keys, vals, err := store.GetCols(c.Context(), sb, input.Keys)
|
||||
if err != nil {
|
||||
@@ -100,6 +103,7 @@ func StoresFindEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConf
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer sl.Close()
|
||||
|
||||
keys, vals, similarities, err := store.Find(c.Context(), sb, input.Key, input.Topk)
|
||||
if err != nil {
|
||||
|
||||
@@ -40,7 +40,7 @@ func TestAssistantEndpoints(t *testing.T) {
|
||||
cl := &config.BackendConfigLoader{}
|
||||
//configsDir := "/tmp/localai/configs"
|
||||
modelPath := "/tmp/localai/model"
|
||||
var ml = model.NewModelLoader(modelPath)
|
||||
var ml = model.NewModelLoader(modelPath, false)
|
||||
|
||||
appConfig := &config.ApplicationConfig{
|
||||
ConfigsDir: configsDir,
|
||||
|
||||
@@ -29,9 +29,9 @@ func Explorer(db *explorer.Database) *fiber.App {
|
||||
httpFS := http.FS(embedDirStatic)
|
||||
|
||||
app.Use(favicon.New(favicon.Config{
|
||||
URL: "/favicon.ico",
|
||||
URL: "/favicon.svg",
|
||||
FileSystem: httpFS,
|
||||
File: "static/favicon.ico",
|
||||
File: "static/favicon.svg",
|
||||
}))
|
||||
|
||||
app.Use("/static", filesystem.New(filesystem.Config{
|
||||
|
||||
@@ -203,18 +203,10 @@ func mergeOpenAIRequestAndBackendConfig(config *config.BackendConfig, input *sch
|
||||
config.Diffusers.ClipSkip = input.ClipSkip
|
||||
}
|
||||
|
||||
if input.ModelBaseName != "" {
|
||||
config.AutoGPTQ.ModelBaseName = input.ModelBaseName
|
||||
}
|
||||
|
||||
if input.NegativePromptScale != 0 {
|
||||
config.NegativePromptScale = input.NegativePromptScale
|
||||
}
|
||||
|
||||
if input.UseFastTokenizer {
|
||||
config.UseFastTokenizer = input.UseFastTokenizer
|
||||
}
|
||||
|
||||
if input.NegativePrompt != "" {
|
||||
config.NegativePrompt = input.NegativePrompt
|
||||
}
|
||||
|
||||
@@ -50,11 +50,10 @@ func RegisterLocalAIRoutes(router *fiber.App,
|
||||
router.Post("/v1/vad", vadChain...)
|
||||
|
||||
// Stores
|
||||
sl := model.NewModelLoader("")
|
||||
router.Post("/stores/set", localai.StoresSetEndpoint(sl, appConfig))
|
||||
router.Post("/stores/delete", localai.StoresDeleteEndpoint(sl, appConfig))
|
||||
router.Post("/stores/get", localai.StoresGetEndpoint(sl, appConfig))
|
||||
router.Post("/stores/find", localai.StoresFindEndpoint(sl, appConfig))
|
||||
router.Post("/stores/set", localai.StoresSetEndpoint(ml, appConfig))
|
||||
router.Post("/stores/delete", localai.StoresDeleteEndpoint(ml, appConfig))
|
||||
router.Post("/stores/get", localai.StoresGetEndpoint(ml, appConfig))
|
||||
router.Post("/stores/find", localai.StoresFindEndpoint(ml, appConfig))
|
||||
|
||||
if !appConfig.DisableMetrics {
|
||||
router.Get("/metrics", localai.LocalAIMetricsEndpoint())
|
||||
|
||||
@@ -112,14 +112,6 @@ func RegisterOpenAIRoutes(app *fiber.App,
|
||||
re.SetOpenAIRequest,
|
||||
openai.ImageEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
|
||||
|
||||
if application.ApplicationConfig().ImageDir != "" {
|
||||
app.Static("/generated-images", application.ApplicationConfig().ImageDir)
|
||||
}
|
||||
|
||||
if application.ApplicationConfig().AudioDir != "" {
|
||||
app.Static("/generated-audio", application.ApplicationConfig().AudioDir)
|
||||
}
|
||||
|
||||
// List models
|
||||
app.Get("/v1/models", openai.ListModelsEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
|
||||
app.Get("/models", openai.ListModelsEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
|
||||
|
||||
@@ -3,7 +3,9 @@ package routes
|
||||
import (
|
||||
"fmt"
|
||||
"html/template"
|
||||
"math"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
@@ -126,6 +128,8 @@ func RegisterUIRoutes(app *fiber.App,
|
||||
// Show the Models page (all models)
|
||||
app.Get("/browse", func(c *fiber.Ctx) error {
|
||||
term := c.Query("term")
|
||||
page := c.Query("page")
|
||||
items := c.Query("items")
|
||||
|
||||
models, _ := gallery.AvailableGalleryModels(appConfig.Galleries, appConfig.ModelPath)
|
||||
|
||||
@@ -164,6 +168,47 @@ func RegisterUIRoutes(app *fiber.App,
|
||||
// "ApplicationConfig": appConfig,
|
||||
}
|
||||
|
||||
if page == "" {
|
||||
page = "1"
|
||||
}
|
||||
|
||||
if page != "" {
|
||||
// return a subset of the models
|
||||
pageNum, err := strconv.Atoi(page)
|
||||
if err != nil {
|
||||
return c.Status(fiber.StatusBadRequest).SendString("Invalid page number")
|
||||
}
|
||||
|
||||
if pageNum == 0 {
|
||||
return c.Render("views/models", summary)
|
||||
}
|
||||
|
||||
itemsNum, err := strconv.Atoi(items)
|
||||
if err != nil {
|
||||
itemsNum = 21
|
||||
}
|
||||
|
||||
totalPages := int(math.Ceil(float64(len(models)) / float64(itemsNum)))
|
||||
|
||||
models = models.Paginate(pageNum, itemsNum)
|
||||
|
||||
prevPage := pageNum - 1
|
||||
nextPage := pageNum + 1
|
||||
if prevPage < 1 {
|
||||
prevPage = 1
|
||||
}
|
||||
if nextPage > totalPages {
|
||||
nextPage = totalPages
|
||||
}
|
||||
if prevPage != pageNum {
|
||||
summary["PrevPage"] = prevPage
|
||||
}
|
||||
summary["NextPage"] = nextPage
|
||||
summary["TotalPages"] = totalPages
|
||||
summary["CurrentPage"] = pageNum
|
||||
summary["Models"] = template.HTML(elements.ListModels(models, processingModels, galleryService))
|
||||
}
|
||||
|
||||
// Render index
|
||||
return c.Render("views/models", summary)
|
||||
})
|
||||
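The page-count arithmetic from the /browse handler above, isolated as a sketch with a couple of spot checks:

package main

import (
    "fmt"
    "math"
)

// totalPages rounds up, matching the math.Ceil expression in the handler.
func totalPages(totalItems, itemsPerPage int) int {
    return int(math.Ceil(float64(totalItems) / float64(itemsPerPage)))
}

func main() {
    fmt.Println(totalPages(100, 21)) // 5
    fmt.Println(totalPages(21, 21))  // 1
    fmt.Println(totalPages(0, 21))   // 0
}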
@@ -171,6 +216,9 @@ func RegisterUIRoutes(app *fiber.App,
|
||||
// Show the models, filtered from the user input
|
||||
// https://htmx.org/examples/active-search/
|
||||
app.Post("/browse/search/models", func(c *fiber.Ctx) error {
|
||||
page := c.Query("page")
|
||||
items := c.Query("items")
|
||||
|
||||
form := struct {
|
||||
Search string `form:"search"`
|
||||
}{}
|
||||
@@ -180,7 +228,26 @@ func RegisterUIRoutes(app *fiber.App,
|
||||
|
||||
models, _ := gallery.AvailableGalleryModels(appConfig.Galleries, appConfig.ModelPath)
|
||||
|
||||
return c.SendString(elements.ListModels(gallery.GalleryModels(models).Search(form.Search), processingModels, galleryService))
|
||||
if page != "" {
|
||||
// return a subset of the models
|
||||
pageNum, err := strconv.Atoi(page)
|
||||
if err != nil {
|
||||
return c.Status(fiber.StatusBadRequest).SendString("Invalid page number")
|
||||
}
|
||||
|
||||
itemsNum, err := strconv.Atoi(items)
|
||||
if err != nil {
|
||||
itemsNum = 21
|
||||
}
|
||||
|
||||
models = models.Paginate(pageNum, itemsNum)
|
||||
}
|
||||
|
||||
if form.Search != "" {
|
||||
models = models.Search(form.Search)
|
||||
}
|
||||
|
||||
return c.SendString(elements.ListModels(models, processingModels, galleryService))
|
||||
})
|
||||
|
||||
/*
|
||||
@@ -305,23 +372,6 @@ func RegisterUIRoutes(app *fiber.App,
|
||||
})
|
||||
}
|
||||
|
||||
// Show the Chat page
|
||||
app.Get("/chat/:model", func(c *fiber.Ctx) error {
|
||||
backendConfigs, _ := services.ListModels(cl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
|
||||
|
||||
summary := fiber.Map{
|
||||
"Title": "LocalAI - Chat with " + c.Params("model"),
|
||||
"BaseURL": utils.BaseURL(c),
|
||||
"ModelsConfig": backendConfigs,
|
||||
"Model": c.Params("model"),
|
||||
"Version": internal.PrintableVersion(),
|
||||
"IsP2PEnabled": p2p.IsP2PEnabled(),
|
||||
}
|
||||
|
||||
// Render index
|
||||
return c.Render("views/chat", summary)
|
||||
})
|
||||
|
||||
app.Get("/talk/", func(c *fiber.Ctx) error {
|
||||
backendConfigs, _ := services.ListModels(cl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
|
||||
|
||||
@@ -344,21 +394,73 @@ func RegisterUIRoutes(app *fiber.App,
|
||||
})
|
||||
|
||||
app.Get("/chat/", func(c *fiber.Ctx) error {
|
||||
backendConfigs := cl.GetAllBackendConfigs()
|
||||
modelsWithoutConfig, _ := services.ListModels(cl, ml, config.NoFilterFn, services.LOOSE_ONLY)
|
||||
|
||||
backendConfigs, _ := services.ListModels(cl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
|
||||
|
||||
if len(backendConfigs) == 0 {
|
||||
if len(backendConfigs)+len(modelsWithoutConfig) == 0 {
|
||||
// If no model is available redirect to the index which suggests how to install models
|
||||
return c.Redirect(utils.BaseURL(c))
|
||||
}
|
||||
modelThatCanBeUsed := ""
|
||||
galleryConfigs := map[string]*gallery.Config{}
|
||||
|
||||
for _, m := range backendConfigs {
|
||||
cfg, err := gallery.GetLocalModelConfiguration(ml.ModelPath, m.Name)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
galleryConfigs[m.Name] = cfg
|
||||
}
|
||||
|
||||
title := "LocalAI - Chat"
|
||||
|
||||
for _, b := range backendConfigs {
|
||||
if b.HasUsecases(config.FLAG_CHAT) {
|
||||
modelThatCanBeUsed = b.Name
|
||||
title = "LocalAI - Chat with " + modelThatCanBeUsed
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
summary := fiber.Map{
|
||||
"Title": "LocalAI - Chat with " + backendConfigs[0],
|
||||
"BaseURL": utils.BaseURL(c),
|
||||
"ModelsConfig": backendConfigs,
|
||||
"Model": backendConfigs[0],
|
||||
"Version": internal.PrintableVersion(),
|
||||
"IsP2PEnabled": p2p.IsP2PEnabled(),
|
||||
"Title": title,
|
||||
"BaseURL": utils.BaseURL(c),
|
||||
"ModelsWithoutConfig": modelsWithoutConfig,
|
||||
"GalleryConfig": galleryConfigs,
|
||||
"ModelsConfig": backendConfigs,
|
||||
"Model": modelThatCanBeUsed,
|
||||
"Version": internal.PrintableVersion(),
|
||||
"IsP2PEnabled": p2p.IsP2PEnabled(),
|
||||
}
|
||||
|
||||
// Render index
|
||||
return c.Render("views/chat", summary)
|
||||
})
|
||||
|
||||
// Show the Chat page
|
||||
app.Get("/chat/:model", func(c *fiber.Ctx) error {
|
||||
backendConfigs := cl.GetAllBackendConfigs()
|
||||
modelsWithoutConfig, _ := services.ListModels(cl, ml, config.NoFilterFn, services.LOOSE_ONLY)
|
||||
|
||||
galleryConfigs := map[string]*gallery.Config{}
|
||||
|
||||
for _, m := range backendConfigs {
|
||||
cfg, err := gallery.GetLocalModelConfiguration(ml.ModelPath, m.Name)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
galleryConfigs[m.Name] = cfg
|
||||
}
|
||||
|
||||
summary := fiber.Map{
|
||||
"Title": "LocalAI - Chat with " + c.Params("model"),
|
||||
"BaseURL": utils.BaseURL(c),
|
||||
"ModelsConfig": backendConfigs,
|
||||
"GalleryConfig": galleryConfigs,
|
||||
"ModelsWithoutConfig": modelsWithoutConfig,
|
||||
"Model": c.Params("model"),
|
||||
"Version": internal.PrintableVersion(),
|
||||
"IsP2PEnabled": p2p.IsP2PEnabled(),
|
||||
}
|
||||
|
||||
// Render index
|
||||
@@ -367,14 +469,16 @@ func RegisterUIRoutes(app *fiber.App,
|
||||
|
||||
app.Get("/text2image/:model", func(c *fiber.Ctx) error {
|
||||
backendConfigs := cl.GetAllBackendConfigs()
|
||||
modelsWithoutConfig, _ := services.ListModels(cl, ml, config.NoFilterFn, services.LOOSE_ONLY)
|
||||
|
||||
summary := fiber.Map{
|
||||
"Title": "LocalAI - Generate images with " + c.Params("model"),
|
||||
"BaseURL": utils.BaseURL(c),
|
||||
"ModelsConfig": backendConfigs,
|
||||
"Model": c.Params("model"),
|
||||
"Version": internal.PrintableVersion(),
|
||||
"IsP2PEnabled": p2p.IsP2PEnabled(),
|
||||
"Title": "LocalAI - Generate images with " + c.Params("model"),
|
||||
"BaseURL": utils.BaseURL(c),
|
||||
"ModelsConfig": backendConfigs,
|
||||
"ModelsWithoutConfig": modelsWithoutConfig,
|
||||
"Model": c.Params("model"),
|
||||
"Version": internal.PrintableVersion(),
|
||||
"IsP2PEnabled": p2p.IsP2PEnabled(),
|
||||
}
|
||||
|
||||
// Render index
|
||||
@@ -382,21 +486,33 @@ func RegisterUIRoutes(app *fiber.App,
})

app.Get("/text2image/", func(c *fiber.Ctx) error {

    backendConfigs := cl.GetAllBackendConfigs()
    modelsWithoutConfig, _ := services.ListModels(cl, ml, config.NoFilterFn, services.LOOSE_ONLY)

    if len(backendConfigs) == 0 {
    if len(backendConfigs)+len(modelsWithoutConfig) == 0 {
        // If no model is available redirect to the index which suggests how to install models
        return c.Redirect(utils.BaseURL(c))
    }

    modelThatCanBeUsed := ""
    title := "LocalAI - Generate images"

    for _, b := range backendConfigs {
        if b.HasUsecases(config.FLAG_IMAGE) {
            modelThatCanBeUsed = b.Name
            title = "LocalAI - Generate images with " + modelThatCanBeUsed
            break
        }
    }

    summary := fiber.Map{
        "Title":        "LocalAI - Generate images with " + backendConfigs[0].Name,
        "BaseURL":      utils.BaseURL(c),
        "ModelsConfig": backendConfigs,
        "Model":        backendConfigs[0].Name,
        "Version":      internal.PrintableVersion(),
        "IsP2PEnabled": p2p.IsP2PEnabled(),
        "Title":               title,
        "BaseURL":             utils.BaseURL(c),
        "ModelsConfig":        backendConfigs,
        "ModelsWithoutConfig": modelsWithoutConfig,
        "Model":               modelThatCanBeUsed,
        "Version":             internal.PrintableVersion(),
        "IsP2PEnabled":        p2p.IsP2PEnabled(),
    }

    // Render index
@@ -405,14 +521,16 @@ func RegisterUIRoutes(app *fiber.App,

app.Get("/tts/:model", func(c *fiber.Ctx) error {
    backendConfigs := cl.GetAllBackendConfigs()
    modelsWithoutConfig, _ := services.ListModels(cl, ml, config.NoFilterFn, services.LOOSE_ONLY)

    summary := fiber.Map{
        "Title":        "LocalAI - Generate images with " + c.Params("model"),
        "BaseURL":      utils.BaseURL(c),
        "ModelsConfig": backendConfigs,
        "Model":        c.Params("model"),
        "Version":      internal.PrintableVersion(),
        "IsP2PEnabled": p2p.IsP2PEnabled(),
        "Title":               "LocalAI - Generate images with " + c.Params("model"),
        "BaseURL":             utils.BaseURL(c),
        "ModelsConfig":        backendConfigs,
        "ModelsWithoutConfig": modelsWithoutConfig,
        "Model":               c.Params("model"),
        "Version":             internal.PrintableVersion(),
        "IsP2PEnabled":        p2p.IsP2PEnabled(),
    }

    // Render index
@@ -420,21 +538,32 @@ func RegisterUIRoutes(app *fiber.App,
})

app.Get("/tts/", func(c *fiber.Ctx) error {

    backendConfigs := cl.GetAllBackendConfigs()
    modelsWithoutConfig, _ := services.ListModels(cl, ml, config.NoFilterFn, services.LOOSE_ONLY)

    if len(backendConfigs) == 0 {
    if len(backendConfigs)+len(modelsWithoutConfig) == 0 {
        // If no model is available redirect to the index which suggests how to install models
        return c.Redirect(utils.BaseURL(c))
    }

    modelThatCanBeUsed := ""
    title := "LocalAI - Generate audio"

    for _, b := range backendConfigs {
        if b.HasUsecases(config.FLAG_TTS) {
            modelThatCanBeUsed = b.Name
            title = "LocalAI - Generate audio with " + modelThatCanBeUsed
            break
        }
    }
    summary := fiber.Map{
        "Title":        "LocalAI - Generate audio with " + backendConfigs[0].Name,
        "BaseURL":      utils.BaseURL(c),
        "ModelsConfig": backendConfigs,
        "Model":        backendConfigs[0].Name,
        "IsP2PEnabled": p2p.IsP2PEnabled(),
        "Version":      internal.PrintableVersion(),
        "Title":               title,
        "BaseURL":             utils.BaseURL(c),
        "ModelsConfig":        backendConfigs,
        "ModelsWithoutConfig": modelsWithoutConfig,
        "Model":               modelThatCanBeUsed,
        "IsP2PEnabled":        p2p.IsP2PEnabled(),
        "Version":             internal.PrintableVersion(),
    }

    // Render index

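The /chat/, /text2image/ and /tts/ index handlers above all repeat the same selection step: scan the installed backend configs and take the first model whose config advertises the matching usecase flag (config.FLAG_CHAT, config.FLAG_IMAGE, config.FLAG_TTS). A minimal sketch of how that loop could be factored into one helper is shown below; it is not part of this changeset, and the flag type name used for the parameter is an assumption:

// Hypothetical helper (sketch only, not part of this diff): returns the first
// configured model that supports the given usecase flag, or "" if none does.
// The parameter type name config.BackendConfigUsecases is assumed here;
// BackendConfig.HasUsecases and the FLAG_* constants are the ones used above.
func firstModelForUsecase(configs []config.BackendConfig, flag config.BackendConfigUsecases) string {
    for _, b := range configs {
        if b.HasUsecases(flag) {
            return b.Name
        }
    }
    return ""
}

// Example use in a handler (sketch):
//   modelThatCanBeUsed := firstModelForUsecase(backendConfigs, config.FLAG_CHAT)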
@@ -27,10 +27,19 @@ SOFTWARE.

*/

function submitKey(event) {
  event.preventDefault();
  localStorage.setItem("key", document.getElementById("apiKey").value);
  document.getElementById("apiKey").blur();
}
function toggleLoader(show) {
  const loader = document.getElementById('loader');
  const sendButton = document.getElementById('send-button');

  if (show) {
    loader.style.display = 'block';
    sendButton.style.display = 'none';
    document.getElementById("input").disabled = true;
  } else {
    document.getElementById("input").disabled = false;
    loader.style.display = 'none';
    sendButton.style.display = 'block';
  }
}

function submitSystemPrompt(event) {
@@ -47,10 +56,9 @@ function submitPrompt(event) {
  const input = document.getElementById("input").value;
  Alpine.store("chat").add("user", input, image);
  document.getElementById("input").value = "";
  const key = localStorage.getItem("key");
  const systemPrompt = localStorage.getItem("system_prompt");

  promptGPT(systemPrompt, key, input);
  Alpine.nextTick(() => { document.getElementById('messages').scrollIntoView(false); });
  promptGPT(systemPrompt, input);
}

function readInputImage() {
@@ -67,14 +75,13 @@ function readInputImage() {
}


async function promptGPT(systemPrompt, key, input) {
async function promptGPT(systemPrompt, input) {
  const model = document.getElementById("chat-model").value;
  // Set class "loader" to the element with "loader" id
  //document.getElementById("loader").classList.add("loader");
  // Make the "loader" visible
  document.getElementById("loader").style.display = "block";
  document.getElementById("input").disabled = true;
  document.getElementById('messages').scrollIntoView(false)
  toggleLoader(true);


  messages = Alpine.store("chat").messages();

@@ -146,7 +153,6 @@ function readInputImage() {
  const response = await fetch("v1/chat/completions", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${key}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
@@ -181,8 +187,8 @@ function readInputImage() {
      const chatStore = Alpine.store("chat");
      chatStore.add("assistant", token);
      // Efficiently scroll into view without triggering multiple reflows
      const messages = document.getElementById('messages');
      messages.scrollTop = messages.scrollHeight;
      // const messages = document.getElementById('messages');
      // messages.scrollTop = messages.scrollHeight;
    };

    let buffer = "";
@@ -244,30 +250,20 @@ function readInputImage() {
  }

  // Remove class "loader" from the element with "loader" id
  //document.getElementById("loader").classList.remove("loader");
  document.getElementById("loader").style.display = "none";
  // enable input
  document.getElementById("input").disabled = false;
  toggleLoader(false);

  // scroll to the bottom of the chat
  document.getElementById('messages').scrollIntoView(false)
  // set focus to the input
  document.getElementById("input").focus();
}

document.getElementById("key").addEventListener("submit", submitKey);
document.getElementById("system_prompt").addEventListener("submit", submitSystemPrompt);

document.getElementById("prompt").addEventListener("submit", submitPrompt);
document.getElementById("input").focus();
document.getElementById("input_image").addEventListener("change", readInputImage);

storeKey = localStorage.getItem("key");
if (storeKey) {
  document.getElementById("apiKey").value = storeKey;
} else {
  document.getElementById("apiKey").value = null;
}

storesystemPrompt = localStorage.getItem("system_prompt");
if (storesystemPrompt) {
  document.getElementById("systemPrompt").value = storesystemPrompt;
Binary file not shown (image; 15 KiB before).