chore(model gallery): add qihoo360_tinyr1-32b-preview

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-02-03 03:02:38 -05:00 · 2025-03-02 10:23:17 +01:00
163 changed files with 2473 additions and 6263 deletions
--- a/.env
+++ b/.env
@@ -29,9 +29,6 @@
 ## Enable/Disable single backend (useful if only one GPU is available)
 # LOCALAI_SINGLE_ACTIVE_BACKEND=true

-# Forces shutdown of the backends if busy (only if LOCALAI_SINGLE_ACTIVE_BACKEND is set)
-# LOCALAI_FORCE_BACKEND_SHUTDOWN=true
-
 ## Specify a build type. Available: cublas, openblas, clblas.
 ## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
 ## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -29,6 +29,10 @@ updates:
    schedule:
      # Check for updates to GitHub Actions every weekday
      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/backend/python/autogptq"
+    schedule:
+      interval: "weekly"
  - package-ecosystem: "pip"
    directory: "/backend/python/bark"
    schedule:
--- a/.github/workflows/deploy-explorer.yaml
+++ b/.github/workflows/deploy-explorer.yaml
@@ -33,7 +33,7 @@ jobs:
        run: |
          CGO_ENABLED=0 make build-api
      - name: rm
-        uses: appleboy/ssh-action@v1.2.2
+        uses: appleboy/ssh-action@v1.2.1
        with:
            host: ${{ secrets.EXPLORER_SSH_HOST }}
            username: ${{ secrets.EXPLORER_SSH_USERNAME }}
@@ -53,7 +53,7 @@ jobs:
            rm: true
            target: ./local-ai
      - name: restarting
-        uses: appleboy/ssh-action@v1.2.2
+        uses: appleboy/ssh-action@v1.2.1
        with:
            host: ${{ secrets.EXPLORER_SSH_HOST }}
            username: ${{ secrets.EXPLORER_SSH_USERNAME }}
--- a/.github/workflows/generate_intel_image.yaml
+++ b/.github/workflows/generate_intel_image.yaml
@@ -15,7 +15,7 @@ jobs:
    strategy:
      matrix:
        include:
-          - base-image: intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04
+          - base-image: intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04
            runs-on: 'ubuntu-latest'
            platforms: 'linux/amd64'
    runs-on: ${{matrix.runs-on}}
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -75,7 +75,6 @@ jobs:
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
-            latest-image: 'latest-gpu-hipblas-core'
          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'false'
@@ -252,7 +251,6 @@ jobs:
            image-type: 'core'
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
-            latest-image: 'latest-gpu-intel-f16-core'
          - build-type: 'sycl_f32'
            platforms: 'linux/amd64'
            tag-latest: 'false'
@@ -263,7 +261,6 @@ jobs:
            image-type: 'core'
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
-            latest-image: 'latest-gpu-intel-f32-core'

  core-image-build:
    uses: ./.github/workflows/image_build.yml
@@ -342,7 +339,6 @@ jobs:
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=4 --output-sync=target"
            skip-drivers: 'false'
-            latest-image: 'latest-gpu-nvidia-cuda-12-core'
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
@@ -355,18 +351,17 @@ jobs:
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            makeflags: "--jobs=4 --output-sync=target"
-            latest-image: 'latest-gpu-nvidia-cuda-12-core'
          - build-type: 'vulkan'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-vulkan-ffmpeg-core'
+            latest-image: 'latest-vulkan-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            makeflags: "--jobs=4 --output-sync=target"
-            latest-image: 'latest-gpu-vulkan-core'
  gh-runner:
    uses: ./.github/workflows/image_build.yml
    with:
--- a/.github/workflows/notify-models.yaml
+++ b/.github/workflows/notify-models.yaml
@@ -8,7 +8,7 @@ jobs:
  notify-discord:
    if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
    env:
-        MODEL_NAME: gemma-3-12b-it
+        MODEL_NAME: hermes-2-theta-llama-3-8b
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v4
@@ -16,7 +16,7 @@ jobs:
        fetch-depth: 0 # needed to checkout all branches for this Action to work
    - uses: mudler/localai-github-action@v1
      with:
-        model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
+        model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
        # Check the PR diff using the current branch and the base branch of the PR
    - uses: GrantBirki/git-diff-action@v2.8.0
      id: git-diff-action
@@ -87,7 +87,7 @@ jobs:
  notify-twitter:
    if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
    env:
-        MODEL_NAME: gemma-3-12b-it
+        MODEL_NAME: hermes-2-theta-llama-3-8b
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v4
--- a/.github/workflows/notify-releases.yaml
+++ b/.github/workflows/notify-releases.yaml
@@ -14,7 +14,7 @@ jobs:
    steps:
    - uses: mudler/localai-github-action@v1
      with:
-        model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
+        model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
    - name: Summarize
      id: summarize
      run: |
@@ -60,4 +60,4 @@ jobs:
        DISCORD_AVATAR: "https://avatars.githubusercontent.com/u/139863280?v=4"
      uses: Ilshidur/action-discord@master
      with:
-        args: ${{ steps.summarize.outputs.message }}
+        args: ${{ steps.summarize.outputs.message }}
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -18,7 +18,7 @@ jobs:
        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
        if: ${{ github.actor != 'dependabot[bot]' }}
-        uses: securego/gosec@v2.22.3
+        uses: securego/gosec@v2.22.0
        with:
          # we let the report trigger content trigger a failure using the GitHub Security features.
          args: '-no-fail -fmt sarif -out results.sarif ./...'
--- a/6
+++ b/6
@@ -15,7 +15,7 @@ ARG TARGETARCH
 ARG TARGETVARIANT

 ENV DEBIAN_FRONTEND=noninteractive
-ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
+ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"

 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
@@ -24,7 +24,6 @@ RUN apt-get update && \
        ca-certificates \
        curl libssl-dev \
        git \
-        git-lfs \
        unzip upx-ucl && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
@@ -431,6 +430,9 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMA
 RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/vllm \
    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "autogptq" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/autogptq \
+    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "bark" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/bark \
    ; fi && \
--- a/30
+++ b/30
@@ -6,7 +6,7 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true

 # llama.cpp versions
-CPPLLAMA_VERSION?=6408210082cc0a61b992b487be7e2ff2efbb9e36
+CPPLLAMA_VERSION?=1782cdfed60952f9ff333fc2ab5245f2be702453

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
@@ -21,8 +21,8 @@ BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git
 BARKCPP_VERSION?=v1.0.0

 # stablediffusion.cpp (ggml)
-STABLEDIFFUSION_GGML_REPO?=https://github.com/richiejp/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=53e3b17eb3d0b5760ced06a1f98320b68b34aaae
+STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
+STABLEDIFFUSION_GGML_VERSION?=19d876ee300a055629926ff836489901f734f2b7

 ONNX_VERSION?=1.20.0
 ONNX_ARCH?=x64
@@ -260,7 +260,11 @@ backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp
 	$(MAKE) -C backend/go/image/stablediffusion-ggml libsd.a

 backend-assets/grpc/stablediffusion-ggml: backend/go/image/stablediffusion-ggml/libsd.a backend-assets/grpc
-	$(MAKE) -C backend/go/image/stablediffusion-ggml CGO_LDFLAGS="$(CGO_LDFLAGS)" stablediffusion-ggml
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ LIBRARY_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion-ggml ./backend/go/image/stablediffusion-ggml/
+ifneq ($(UPX),)
+	$(UPX) backend-assets/grpc/stablediffusion-ggml
+endif

 sources/onnxruntime:
 	mkdir -p sources/onnxruntime
@@ -505,10 +509,18 @@ protogen-go-clean:
 	$(RM) bin/*

 .PHONY: protogen-python
-protogen-python: bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
+protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen

 .PHONY: protogen-python-clean
-protogen-python-clean: bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean  exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
+protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean  exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
+
+.PHONY: autogptq-protogen
+autogptq-protogen:
+	$(MAKE) -C backend/python/autogptq protogen
+
+.PHONY: autogptq-protogen-clean
+autogptq-protogen-clean:
+	$(MAKE) -C backend/python/autogptq protogen-clean

 .PHONY: bark-protogen
 bark-protogen:
@@ -585,6 +597,7 @@ vllm-protogen-clean:
 ## GRPC
 # Note: it is duplicated in the Dockerfile
 prepare-extra-conda-environments: protogen-python
+	$(MAKE) -C backend/python/autogptq
 	$(MAKE) -C backend/python/bark
 	$(MAKE) -C backend/python/coqui
 	$(MAKE) -C backend/python/diffusers
@@ -796,8 +809,7 @@ docker-aio-all:

 docker-image-intel:
 	docker build \
-		--progress plain \
-		--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu24.04 \
+		--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
 		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
 		--build-arg GO_TAGS="none" \
 		--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@@ -805,7 +817,7 @@ docker-image-intel:

 docker-image-intel-xpu:
 	docker build \
-		--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04 \
+		--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
 		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
 		--build-arg GO_TAGS="none" \
 		--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 <h1 align="center">
  <br>
-  <img height="300" src="./core/http/static/logo.png"> <br>
+  <img height="300" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"> <br>
+    LocalAI
 <br>
 </h1>

@@ -47,58 +48,9 @@

 [![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)

-**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that's compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
+**LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API that’s compatible with OpenAI (Elevenlabs, Anthropic...) API specifications for local AI inferencing. It allows you to run LLMs and generate images, audio, and more, locally or on-prem with consumer-grade hardware, supporting multiple model families. It does not require a GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).

-
-## 📚🆕 Local Stack Family
-
-🆕 LocalAI is now part of a comprehensive suite of AI tools designed to work together:
-
-<table>
-  <tr>
-    <td width="50%" valign="top">
-      <a href="https://github.com/mudler/LocalAGI">
-        <img src="https://raw.githubusercontent.com/mudler/LocalAGI/refs/heads/main/webui/react-ui/public/logo_2.png" width="300" alt="LocalAGI Logo">
-      </a>
-    </td>
-    <td width="50%" valign="top">
-      <h3><a href="https://github.com/mudler/LocalAGI">LocalAGI</a></h3>
-      <p>A powerful Local AI agent management platform that serves as a drop-in replacement for OpenAI's Responses API, enhanced with advanced agentic capabilities.</p>
-    </td>
-  </tr>
-  <tr>
-    <td width="50%" valign="top">
-      <a href="https://github.com/mudler/LocalRecall">
-        <img src="https://raw.githubusercontent.com/mudler/LocalRecall/refs/heads/main/static/localrecall_horizontal.png" width="300" alt="LocalRecall Logo">
-      </a>
-    </td>
-    <td width="50%" valign="top">
-      <h3><a href="https://github.com/mudler/LocalRecall">LocalRecall</a></h3>
-      <p>A REST-ful API and knowledge base management system that provides persistent memory and storage capabilities for AI agents.</p>
-    </td>
-  </tr>
-</table>
-
-## Screenshots
-
-
-| Talk Interface | Generate Audio |
-| --- | --- |
-| ![Screenshot 2025-03-31 at 12-01-36 LocalAI - Talk](./docs/assets/images/screenshots/screenshot_tts.png) | ![Screenshot 2025-03-31 at 12-01-29 LocalAI - Generate audio with voice-en-us-ryan-low](./docs/assets/images/screenshots/screenshot_tts.png) |
-
-| Models Overview | Generate Images |
-| --- | --- |
-| ![Screenshot 2025-03-31 at 12-01-20 LocalAI - Models](./docs/assets/images/screenshots/screenshot_gallery.png) | ![Screenshot 2025-03-31 at 12-31-41 LocalAI - Generate images with flux 1-dev](./docs/assets/images/screenshots/screenshot_image.png) |
-
-| Chat Interface | Home |
-| --- | --- |
-| ![Screenshot 2025-03-31 at 11-57-44 LocalAI - Chat with localai-functioncall-qwen2 5-7b-v0 5](./docs/assets/images/screenshots/screenshot_chat.png) | ![Screenshot 2025-03-31 at 11-57-23 LocalAI API - c2a39e3 (c2a39e3639227cfd94ffffe9f5691239acc275a8)](./docs/assets/images/screenshots/screenshot_home.png) |
-
-| Login | Swarm |
-| --- | --- |
-|![Screenshot 2025-03-31 at 12-09-59 ](./docs/assets/images/screenshots/screenshot_login.png) | ![Screenshot 2025-03-31 at 12-10-39 LocalAI - P2P dashboard](./docs/assets/images/screenshots/screenshot_p2p.png) |
-
-## 💻 Quickstart
+![screen](https://github.com/mudler/LocalAI/assets/2420543/20b5ccd2-8393-44f0-aaf6-87a23806381e)

 Run the installer script:

@@ -107,21 +59,17 @@ curl https://localai.io/install.sh | sh
 ```

 Or run with docker:
-
-### CPU only image:
 ```bash
+# CPU only image:
 docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-cpu
-```
-### Nvidia GPU:
-```bash
+
+# Nvidia GPU:
 docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
-```
-### CPU and GPU image (bigger size):
-```bash
+
+# CPU and GPU image (bigger size):
 docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
-```
-### AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)
-```bash
+
+# AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)
 docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
 ```

@@ -140,13 +88,10 @@ local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
 local-ai run oci://localai/phi-2:latest
 ```

-For more information, see [💻 Getting started](https://localai.io/basics/getting_started/index.html)
+[💻 Getting started](https://localai.io/basics/getting_started/index.html)

 ## 📰 Latest project news

- Apr 2025: [LocalAGI](https://github.com/mudler/LocalAGI) and [LocalRecall](https://github.com/mudler/LocalRecall) join the LocalAI family stack.
- Apr 2025: WebUI overhaul, AIO images updates
- Feb 2025: Backend cleanup, Breaking changes, new backends (kokoro, OutelTTS, faster-whisper), Nvidia L4T images
 - Jan 2025: LocalAI model release: https://huggingface.co/mudler/LocalAI-functioncall-phi-4-v0.3, SANA support in diffusers: https://github.com/mudler/LocalAI/pull/4603
 - Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 )
 - Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 )
@@ -160,6 +105,19 @@ For more information, see [💻 Getting started](https://localai.io/basics/getti

 Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

+## 🔥🔥 Hot topics (looking for help):
+
+- Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
+- Realtime API: https://github.com/mudler/LocalAI/issues/3714
+- WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
+- Backends v2: https://github.com/mudler/LocalAI/issues/1126
+- Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
+- Assistant API: https://github.com/mudler/LocalAI/issues/1273
+- Vulkan: https://github.com/mudler/LocalAI/issues/1647
+- Anthropic API: https://github.com/mudler/LocalAI/issues/1808
+
+If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22
+
 ## 🚀 [Features](https://localai.io/features/)

 - 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `transformers`, `vllm` ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
@@ -173,10 +131,12 @@ Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3A
 - 🥽 [Vision API](https://localai.io/features/gpt-vision/)
 - 📈 [Reranker API](https://localai.io/features/reranker/)
 - 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
- [Agentic capabilities](https://github.com/mudler/LocalAGI)
 - 🔊 Voice activity detection (Silero-VAD support)
 - 🌍 Integrated WebUI!

+## 💻 Usage
+
+Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.

 ### 🔗 Community and integrations

--- a/aio/cpu/embeddings.yaml
+++ b/aio/cpu/embeddings.yaml
@@ -1,7 +1,7 @@
-embeddings: true
 name: text-embedding-ada-002
+embeddings: true
 parameters:
-  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
+  model: huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/llama-3.2-1b-instruct-q4_k_m.gguf

 usage: |
    You can test this model with curl like this:
--- a/aio/cpu/text-to-text.yaml
+++ b/aio/cpu/text-to-text.yaml
@@ -1,57 +1,101 @@
-context_size: 8192
-f16: true
-function:
-  grammar:
-    no_mixed_free_string: true
-    schema_type: llama3.1 # or JSON is supported too (json)
-  response_regex:
-  - <function=(?P<name>\w+)>(?P<arguments>.*)</function>
-mmap: true
 name: gpt-4
+mmap: true
 parameters:
-  model: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
+  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
+context_size: 8192
+
 stopwords:
- <|im_end|>
- <dummy32000>
- <|eot_id|>
- <|end_of_text|>
+- "<|im_end|>"
+- "<dummy32000>"
+- "</tool_call>"
+- "<|eot_id|>"
+- "<|end_of_text|>"
+
+function:
+  # disable injecting the "answer" tool
+  disable_no_action: true
+
+  grammar:
+    # This allows the grammar to also return messages
+    mixed_mode: true
+    # Suffix to add to the grammar
+    #prefix: '<tool_call>\n'
+    # Force parallel calls in the grammar
+    # parallel_calls: true
+
+  return_name_in_function_response: true
+  # Without grammar uncomment the lines below
+  # Warning: this is relying only on the capability of the
+  # LLM model to generate the correct function call.
+  json_regex_match: 
+   - "(?s)<tool_call>(.*?)</tool_call>"
+   - "(?s)<tool_call>(.*?)"
+  replace_llm_results:
+  # Drop the scratchpad content from responses
+  - key: "(?s)<scratchpad>.*</scratchpad>"
+    value: ""
+  replace_function_results: 
+  # Replace everything that is not a JSON array or object
+  # 
+  - key: '(?s)^[^{\[]*'
+    value: ""
+  - key: '(?s)[^}\]]*$'
+    value: ""
+  - key: "'([^']*?)'"
+    value: "_DQUOTE_${1}_DQUOTE_"
+  - key: '\\"'
+    value: "__TEMP_QUOTE__"
+  - key: "\'"
+    value: "'"
+  - key: "_DQUOTE_"
+    value: '"'
+  - key: "__TEMP_QUOTE__"
+    value: '"'
+  # Drop the scratchpad content from responses
+  - key: "(?s)<scratchpad>.*</scratchpad>"
+    value: ""
+
 template:
  chat: |
-    <|begin_of_text|><|start_header_id|>system<|end_header_id|>
-    You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>
-    {{.Input }}
-    <|start_header_id|>assistant<|end_header_id|>
+    {{.Input -}}
+    <|im_start|>assistant
  chat_message: |
-    <|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|>
-    {{ if .FunctionCall -}}
-    {{ else if eq .RoleName "tool" -}}
-    The Function was executed and the response was:
-    {{ end -}}
-    {{ if .Content -}}
-    {{.Content -}}
-    {{ else if .FunctionCall -}}
-    {{ range .FunctionCall }}
-    [{{.FunctionCall.Name}}({{.FunctionCall.Arguments}})]
-    {{ end }}
-    {{ end -}}
-    <|eot_id|>
+    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
+    {{- if .FunctionCall }}
+    <tool_call>
+    {{- else if eq .RoleName "tool" }}
+    <tool_response>
+    {{- end }}
+    {{- if .Content}}
+    {{.Content }}
+    {{- end }}
+    {{- if .FunctionCall}}
+    {{toJson .FunctionCall}}
+    {{- end }}
+    {{- if .FunctionCall }}
+    </tool_call>
+    {{- else if eq .RoleName "tool" }}
+    </tool_response>
+    {{- end }}<|im_end|>
  completion: |
    {{.Input}}
-  function: |
-    <|start_header_id|>system<|end_header_id|>
-    You are an expert in composing functions. You are given a question and a set of possible functions.
-    Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
-    If none of the functions can be used, point it out. If the given question lacks the parameters required by the function, also point it out. You should only return the function call in tools call sections.
-    If you decide to invoke any of the function(s), you MUST put it in the format as follows:
-    [func_name1(params_name1=params_value1,params_name2=params_value2,...),func_name2(params_name1=params_value1,params_name2=params_value2,...)]
-    You SHOULD NOT include any other text in the response.
-    Here is a list of functions in JSON format that you can invoke.
-    {{toJson .Functions}}
-    <|eot_id|><|start_header_id|>user<|end_header_id|>
-    {{.Input}}
-    <|eot_id|><|start_header_id|>assistant<|end_header_id|>
-
-download_files:
- filename: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
-  sha256: 2e220a14ba4328fee38cf36c2c068261560f999fadb5725ce5c6d977cb5126b5
-  uri: huggingface://bartowski/Hermes-3-Llama-3.2-3B-GGUF/Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
+  function: |-
+    <|im_start|>system
+    You are a function calling AI model.
+    Here are the available tools:
+    <tools>
+    {{range .Functions}}
+    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
+    {{end}}
+    </tools>
+    You should call the tools provided to you sequentially
+    Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
+    <scratchpad>
+    {step-by-step reasoning and plan in bullet points}
+    </scratchpad>
+    For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
+    <tool_call>
+    {"arguments": <args-dict>, "name": <function-name>}
+    </tool_call><|im_end|>
+    {{.Input -}}
+    <|im_start|>assistant
--- a/aio/cpu/vision.yaml
+++ b/aio/cpu/vision.yaml
@@ -1,49 +1,31 @@
+backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
-mmproj: minicpm-v-2_6-mmproj-f16.gguf
 name: gpt-4o
+
+roles:
+  user: "USER:"
+  assistant: "ASSISTANT:"
+  system: "SYSTEM:"
+
+mmproj: bakllava-mmproj.gguf
 parameters:
-  model: minicpm-v-2_6-Q4_K_M.gguf
-stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|endoftext|>
+  model: bakllava.gguf
+
 template:
  chat: |
-    {{.Input -}}
-    <|im_start|>assistant
-  chat_message: |
-    <|im_start|>{{ .RoleName }}
-    {{ if .FunctionCall -}}
-    Function call:
-    {{ else if eq .RoleName "tool" -}}
-    Function response:
-    {{ end -}}
-    {{ if .Content -}}
-    {{.Content }}
-    {{ end -}}
-    {{ if .FunctionCall -}}
-    {{toJson .FunctionCall}}
-    {{ end -}}<|im_end|>
-  completion: |
+    A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
    {{.Input}}
-  function: |
-    <|im_start|>system
-    You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
-    {{range .Functions}}
-    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
-    {{end}}
-    For each function call return a json object with function name and arguments
-    <|im_end|>
-    {{.Input -}}
-    <|im_start|>assistant
+    ASSISTANT:

 download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf
-  sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
-  sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
+- filename: bakllava.gguf
+  uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
+- filename: bakllava-mmproj.gguf
+  uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
+
+usage: |
+    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+        "model": "gpt-4-vision-preview",
+        "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
--- a/aio/gpu-8g/embeddings.yaml
+++ b/aio/gpu-8g/embeddings.yaml
@@ -1,7 +1,7 @@
-embeddings: true
 name: text-embedding-ada-002
+backend: sentencetransformers
 parameters:
-  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
+  model: all-MiniLM-L6-v2

 usage: |
    You can test this model with curl like this:
--- a/aio/gpu-8g/text-to-text.yaml
+++ b/aio/gpu-8g/text-to-text.yaml
@@ -1,53 +1,101 @@
-context_size: 4096
-f16: true
-function:
-  capture_llm_results:
-  - (?s)<Thought>(.*?)</Thought>
-  grammar:
-    properties_order: name,arguments
-  json_regex_match:
-  - (?s)<Output>(.*?)</Output>
-  replace_llm_results:
-  - key: (?s)<Thought>(.*?)</Thought>
-    value: ""
-mmap: true
 name: gpt-4
+mmap: true
 parameters:
-  model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
+  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
+context_size: 8192
+
 stopwords:
- <|im_end|>
- <dummy32000>
- </s>
+- "<|im_end|>"
+- "<dummy32000>"
+- "</tool_call>"
+- "<|eot_id|>"
+- "<|end_of_text|>"
+
+function:
+  # disable injecting the "answer" tool
+  disable_no_action: true
+
+  grammar:
+    # This allows the grammar to also return messages
+    mixed_mode: true
+    # Prefix to add to the grammar
+    #prefix: '<tool_call>\n'
+    # Force parallel calls in the grammar
+    # parallel_calls: true
+
+  return_name_in_function_response: true
+  # Without grammar uncomment the lines below
+  # Warning: this is relying only on the capability of the
+  # LLM model to generate the correct function call.
+  json_regex_match: 
+   - "(?s)<tool_call>(.*?)</tool_call>"
+   - "(?s)<tool_call>(.*?)"
+  replace_llm_results:
+  # Drop the scratchpad content from responses
+  - key: "(?s)<scratchpad>.*</scratchpad>"
+    value: ""
+  replace_function_results: 
+  # Replace everything that is not JSON array or object
+  # 
+  - key: '(?s)^[^{\[]*'
+    value: ""
+  - key: '(?s)[^}\]]*$'
+    value: ""
+  - key: "'([^']*?)'"
+    value: "_DQUOTE_${1}_DQUOTE_"
+  - key: '\\"'
+    value: "__TEMP_QUOTE__"
+  - key: "\'"
+    value: "'"
+  - key: "_DQUOTE_"
+    value: '"'
+  - key: "__TEMP_QUOTE__"
+    value: '"'
+  # Drop the scratchpad content from responses
+  - key: "(?s)<scratchpad>.*</scratchpad>"
+    value: ""
+
 template:
  chat: |
    {{.Input -}}
    <|im_start|>assistant
  chat_message: |
-    <|im_start|>{{ .RoleName }}
-    {{ if .FunctionCall -}}
-    Function call:
-    {{ else if eq .RoleName "tool" -}}
-    Function response:
-    {{ end -}}
-    {{ if .Content -}}
+    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
+    {{- if .FunctionCall }}
+    <tool_call>
+    {{- else if eq .RoleName "tool" }}
+    <tool_response>
+    {{- end }}
+    {{- if .Content}}
    {{.Content }}
-    {{ end -}}
-    {{ if .FunctionCall -}}
+    {{- end }}
+    {{- if .FunctionCall}}
    {{toJson .FunctionCall}}
-    {{ end -}}<|im_end|>
+    {{- end }}
+    {{- if .FunctionCall }}
+    </tool_call>
+    {{- else if eq .RoleName "tool" }}
+    </tool_response>
+    {{- end }}<|im_end|>
  completion: |
    {{.Input}}
-  function: |
+  function: |-
    <|im_start|>system
-    You are an AI assistant that executes function calls, and these are the tools at your disposal:
+    You are a function calling AI model.
+    Here are the available tools:
+    <tools>
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
-    <|im_end|>
+    </tools>
+    You should call the tools provided to you sequentially
+    Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
+    <scratchpad>
+    {step-by-step reasoning and plan in bullet points}
+    </scratchpad>
+    For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
+    <tool_call>
+    {"arguments": <args-dict>, "name": <function-name>}
+    </tool_call><|im_end|>
    {{.Input -}}
-    <|im_start|>assistant
-
-download_files:
- filename: localai-functioncall-phi-4-v0.3-q4_k_m.gguf
-  sha256: 23fee048ded2a6e2e1a7b6bbefa6cbf83068f194caa9552aecbaa00fec8a16d5
-  uri: huggingface://mudler/LocalAI-functioncall-phi-4-v0.3-Q4_K_M-GGUF/localai-functioncall-phi-4-v0.3-q4_k_m.gguf
+    <|im_start|>assistant
--- a/aio/gpu-8g/vision.yaml
+++ b/aio/gpu-8g/vision.yaml
@@ -1,49 +1,35 @@
+backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
-mmproj: minicpm-v-2_6-mmproj-f16.gguf
 name: gpt-4o
+
+roles:
+  user: "USER:"
+  assistant: "ASSISTANT:"
+  system: "SYSTEM:"
+
+mmproj: llava-v1.6-7b-mmproj-f16.gguf
 parameters:
-  model: minicpm-v-2_6-Q4_K_M.gguf
-stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|endoftext|>
+  model: llava-v1.6-mistral-7b.Q5_K_M.gguf
+  temperature: 0.2
+  top_k: 40
+  top_p: 0.95
+  seed: -1
+
 template:
  chat: |
-    {{.Input -}}
-    <|im_start|>assistant
-  chat_message: |
-    <|im_start|>{{ .RoleName }}
-    {{ if .FunctionCall -}}
-    Function call:
-    {{ else if eq .RoleName "tool" -}}
-    Function response:
-    {{ end -}}
-    {{ if .Content -}}
-    {{.Content }}
-    {{ end -}}
-    {{ if .FunctionCall -}}
-    {{toJson .FunctionCall}}
-    {{ end -}}<|im_end|>
-  completion: |
+    A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
    {{.Input}}
-  function: |
-    <|im_start|>system
-    You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
-    {{range .Functions}}
-    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
-    {{end}}
-    For each function call return a json object with function name and arguments
-    <|im_end|>
-    {{.Input -}}
-    <|im_start|>assistant
+    ASSISTANT:

 download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf
-  sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
-  sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
+- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
+  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
+- filename: llava-v1.6-7b-mmproj-f16.gguf
+  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
+
+usage: |
+    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+        "model": "gpt-4o",
+        "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
--- a/aio/intel/embeddings.yaml
+++ b/aio/intel/embeddings.yaml
@@ -1,7 +1,7 @@
-embeddings: true
 name: text-embedding-ada-002
+backend: sentencetransformers
 parameters:
-  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
+  model: all-MiniLM-L6-v2

 usage: |
    You can test this model with curl like this:
--- a/aio/intel/text-to-text.yaml
+++ b/aio/intel/text-to-text.yaml
@@ -1,53 +1,103 @@
-context_size: 4096
-f16: true
-function:
-  capture_llm_results:
-  - (?s)<Thought>(.*?)</Thought>
-  grammar:
-    properties_order: name,arguments
-  json_regex_match:
-  - (?s)<Output>(.*?)</Output>
-  replace_llm_results:
-  - key: (?s)<Thought>(.*?)</Thought>
-    value: ""
-mmap: true
 name: gpt-4
+mmap: false
+context_size: 8192
+
+f16: false
 parameters:
-  model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
+  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
+
 stopwords:
- <|im_end|>
- <dummy32000>
- </s>
+- "<|im_end|>"
+- "<dummy32000>"
+- "</tool_call>"
+- "<|eot_id|>"
+- "<|end_of_text|>"
+
+function:
+  # disable injecting the "answer" tool
+  disable_no_action: true
+
+  grammar:
+    # This allows the grammar to also return messages
+    mixed_mode: true
+    # Prefix to add to the grammar
+    #prefix: '<tool_call>\n'
+    # Force parallel calls in the grammar
+    # parallel_calls: true
+
+  return_name_in_function_response: true
+  # Without grammar uncomment the lines below
+  # Warning: this is relying only on the capability of the
+  # LLM model to generate the correct function call.
+  json_regex_match: 
+   - "(?s)<tool_call>(.*?)</tool_call>"
+   - "(?s)<tool_call>(.*?)"
+  replace_llm_results:
+  # Drop the scratchpad content from responses
+  - key: "(?s)<scratchpad>.*</scratchpad>"
+    value: ""
+  replace_function_results: 
+  # Replace everything that is not JSON array or object
+  # 
+  - key: '(?s)^[^{\[]*'
+    value: ""
+  - key: '(?s)[^}\]]*$'
+    value: ""
+  - key: "'([^']*?)'"
+    value: "_DQUOTE_${1}_DQUOTE_"
+  - key: '\\"'
+    value: "__TEMP_QUOTE__"
+  - key: "\'"
+    value: "'"
+  - key: "_DQUOTE_"
+    value: '"'
+  - key: "__TEMP_QUOTE__"
+    value: '"'
+  # Drop the scratchpad content from responses
+  - key: "(?s)<scratchpad>.*</scratchpad>"
+    value: ""
+
 template:
  chat: |
    {{.Input -}}
    <|im_start|>assistant
  chat_message: |
-    <|im_start|>{{ .RoleName }}
-    {{ if .FunctionCall -}}
-    Function call:
-    {{ else if eq .RoleName "tool" -}}
-    Function response:
-    {{ end -}}
-    {{ if .Content -}}
+    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
+    {{- if .FunctionCall }}
+    <tool_call>
+    {{- else if eq .RoleName "tool" }}
+    <tool_response>
+    {{- end }}
+    {{- if .Content}}
    {{.Content }}
-    {{ end -}}
-    {{ if .FunctionCall -}}
+    {{- end }}
+    {{- if .FunctionCall}}
    {{toJson .FunctionCall}}
-    {{ end -}}<|im_end|>
+    {{- end }}
+    {{- if .FunctionCall }}
+    </tool_call>
+    {{- else if eq .RoleName "tool" }}
+    </tool_response>
+    {{- end }}<|im_end|>
  completion: |
    {{.Input}}
-  function: |
+  function: |-
    <|im_start|>system
-    You are an AI assistant that executes function calls, and these are the tools at your disposal:
+    You are a function calling AI model.
+    Here are the available tools:
+    <tools>
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
-    <|im_end|>
+    </tools>
+    You should call the tools provided to you sequentially
+    Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
+    <scratchpad>
+    {step-by-step reasoning and plan in bullet points}
+    </scratchpad>
+    For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
+    <tool_call>
+    {"arguments": <args-dict>, "name": <function-name>}
+    </tool_call><|im_end|>
    {{.Input -}}
    <|im_start|>assistant
-
-download_files:
- filename: localai-functioncall-phi-4-v0.3-q4_k_m.gguf
-  sha256: 23fee048ded2a6e2e1a7b6bbefa6cbf83068f194caa9552aecbaa00fec8a16d5
-  uri: huggingface://mudler/LocalAI-functioncall-phi-4-v0.3-Q4_K_M-GGUF/localai-functioncall-phi-4-v0.3-q4_k_m.gguf
--- a/aio/intel/vision.yaml
+++ b/aio/intel/vision.yaml
@@ -1,50 +1,35 @@
+backend: llama-cpp
 context_size: 4096
-f16: true
-mmap: true
-mmproj: minicpm-v-2_6-mmproj-f16.gguf
+mmap: false
+f16: false
 name: gpt-4o
+
+roles:
+  user: "USER:"
+  assistant: "ASSISTANT:"
+  system: "SYSTEM:"
+
+mmproj: llava-v1.6-7b-mmproj-f16.gguf
 parameters:
-  model: minicpm-v-2_6-Q4_K_M.gguf
-stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|endoftext|>
+  model: llava-v1.6-mistral-7b.Q5_K_M.gguf
+  temperature: 0.2
+  top_k: 40
+  top_p: 0.95
+  seed: -1
+
 template:
  chat: |
-    {{.Input -}}
-    <|im_start|>assistant
-  chat_message: |
-    <|im_start|>{{ .RoleName }}
-    {{ if .FunctionCall -}}
-    Function call:
-    {{ else if eq .RoleName "tool" -}}
-    Function response:
-    {{ end -}}
-    {{ if .Content -}}
-    {{.Content }}
-    {{ end -}}
-    {{ if .FunctionCall -}}
-    {{toJson .FunctionCall}}
-    {{ end -}}<|im_end|>
-  completion: |
+    A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
    {{.Input}}
-  function: |
-    <|im_start|>system
-    You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
-    {{range .Functions}}
-    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
-    {{end}}
-    For each function call return a json object with function name and arguments
-    <|im_end|>
-    {{.Input -}}
-    <|im_start|>assistant
-
+    ASSISTANT:

 download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf
-  sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
-  sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
+- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
+  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
+- filename: llava-v1.6-7b-mmproj-f16.gguf
+  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
+
+usage: |
+    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+        "model": "gpt-4o",
+        "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -165,6 +165,7 @@ message Reply {

 message GrammarTrigger {
  string word = 1;
+  bool at_start = 2;
 }

 message ModelOptions {
@@ -190,7 +191,11 @@ message ModelOptions {
  int32 NGQA = 20;
  string ModelFile = 21;

-
+  // AutoGPTQ
+  string Device = 22;
+  bool UseTriton = 23;
+  string ModelBaseName = 24;
+  bool UseFastTokenizer = 25;

  // Diffusers
  string PipelineType = 26;
--- a/backend/cpp/llama/CMakeLists.txt
+++ b/backend/cpp/llama/CMakeLists.txt
@@ -2,7 +2,7 @@
 ## XXX: In some versions of CMake clip wasn't being built before llama.
 ## This is an hack for now, but it should be fixed in the future.
 set(TARGET myclip)
-add_library(${TARGET} clip.cpp clip.h clip-impl.h llava.cpp llava.h)
+add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
 install(TARGETS ${TARGET} LIBRARY)
 target_include_directories(myclip PUBLIC .)
 target_include_directories(myclip PUBLIC ../..)
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -8,7 +8,7 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 TARGET?=--target grpc-server

 # Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
-CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
+CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

 # If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)
@@ -36,18 +36,11 @@ else ifeq ($(OS),Darwin)
 endif

 ifeq ($(BUILD_TYPE),sycl_f16)
-	CMAKE_ARGS+=-DGGML_SYCL=ON \
-		-DCMAKE_C_COMPILER=icx \
-		-DCMAKE_CXX_COMPILER=icpx \
-		-DCMAKE_CXX_FLAGS="-fsycl" \
-		-DGGML_SYCL_F16=ON
+	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
 endif

 ifeq ($(BUILD_TYPE),sycl_f32)
-	CMAKE_ARGS+=-DGGML_SYCL=ON \
-		-DCMAKE_C_COMPILER=icx \
-		-DCMAKE_CXX_COMPILER=icpx \
-		-DCMAKE_CXX_FLAGS="-fsycl"
+	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 endif

 llama.cpp:
@@ -84,4 +77,4 @@ ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 else
 	+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)
 endif
-	cp llama.cpp/build/bin/grpc-server .
+	cp llama.cpp/build/bin/grpc-server .
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -217,7 +217,6 @@ struct llama_client_slot

    bool infill = false;
    bool embedding = false;
-    bool reranker = false;
    bool has_next_token = true;
    bool truncated = false;
    bool stopped_eos = false;
@@ -468,10 +467,9 @@ struct llama_server_context
    bool all_slots_are_idle = false;
    bool add_bos_token      = true;
    bool has_eos_token      = true;
-    bool has_gpu = false;

    bool grammar_lazy = false;
-    std::vector<common_grammar_trigger> grammar_triggers;
+    std::vector<common_grammar_trigger> grammar_trigger_words;

    int32_t n_ctx;  // total context for all clients / slots

@@ -510,15 +508,12 @@ struct llama_server_context
    bool load_model(const common_params &params_)
    {
        params = params_;
-        if (!params.mmproj.path.empty()) {
+        if (!params.mmproj.empty()) {
            multimodal = true;
            LOG_INFO("Multi Modal Mode Enabled", {});
-            clp_ctx = clip_init(params.mmproj.path.c_str(), clip_context_params {
-                /* use_gpu */ has_gpu,
-                /*verbosity=*/ GGML_LOG_LEVEL_INFO,
-            });
+            clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
            if(clp_ctx == nullptr) {
-                LOG_ERR("unable to load clip model: %s", params.mmproj.path.c_str());
+                LOG_ERR("unable to load clip model: %s", params.mmproj.c_str());
                return false;
            }

@@ -532,16 +527,10 @@ struct llama_server_context
        ctx = common_init.context.release();
        if (model == nullptr)
        {
-            LOG_ERR("unable to load model: %s", params.model.path.c_str());
+            LOG_ERR("unable to load model: %s", params.model.c_str());
            return false;
        }

-        // Enable reranking if embeddings are enabled - moved after context initialization
-        if (params.embedding) {
-            params.reranking = true;
-            LOG_INFO("Reranking enabled (embeddings are enabled)", {});
-        }
-
        if (multimodal) {
            const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
            const int n_embd_llm  = llama_model_n_embd(model);
@@ -720,7 +709,7 @@ struct llama_server_context
        slot->sparams.grammar           = json_value(data, "grammar",           default_sparams.grammar);
        slot->sparams.n_probs           = json_value(data, "n_probs",           default_sparams.n_probs);
        slot->sparams.min_keep          = json_value(data, "min_keep",          default_sparams.min_keep);
-        slot->sparams.grammar_triggers = grammar_triggers;
+        slot->sparams.grammar_trigger_words = grammar_trigger_words;
        slot->sparams.grammar_lazy = grammar_lazy;

        if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
@@ -1361,7 +1350,7 @@ struct llama_server_context
        queue_results.send(res);
    }

-    void send_embedding(llama_client_slot &slot, const llama_batch & batch)
+    void send_embedding(llama_client_slot &slot)
    {
        task_result res;
        res.id = slot.task_id;
@@ -1383,96 +1372,16 @@ struct llama_server_context
        else
        {
            const float *data = llama_get_embeddings(ctx);
-            std::vector<float> embd_res(n_embd, 0.0f);
-            std::vector<std::vector<float>> embedding;
-            for (int i = 0; i < batch.n_tokens; ++i) {
-                if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
-                    continue;
-                }
-
-                const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-                if (embd == NULL) {
-                    embd = llama_get_embeddings_ith(ctx, i);
-                }
-
-                if (embd == NULL) {
-                    LOG("failed to get embeddings");
-
-                    continue;
-                }
-
-                // normalize only when there is pooling
-                // TODO: configurable
-                if (llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_NONE) {
-                    common_embd_normalize(embd, embd_res.data(), n_embd, 2);
-                    embedding.push_back(embd_res);
-                } else {
-                    embedding.push_back({ embd, embd + n_embd });
-                }
-            }
-
-            // OAI compat
+            std::vector<float> embedding(data, data + n_embd);
            res.result_json = json
            {
-                {"embedding", embedding[0] },
+                {"embedding", embedding },
            };
        }
        queue_results.send(res);
    }

-    void send_rerank(llama_client_slot &slot, const llama_batch & batch)
-    {
-        task_result res;
-        res.id = slot.task_id;
-        res.multitask_id = slot.multitask_id;
-        res.error = false;
-        res.stop = true;
-
-        float score = -1e6f; // Default score if we fail to get embeddings
-
-        if (!params.reranking)
-        {
-            LOG_WARNING("reranking disabled", {
-                {"params.reranking", params.reranking},
-            });
-        }
-        else if (ctx == nullptr)
-        {
-            LOG_ERR("context is null, cannot perform reranking");
-            res.error = true;
-        }
-        else
-        {
-            for (int i = 0; i < batch.n_tokens; ++i) {
-                if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
-                    continue;
-                }
-
-                const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-                if (embd == NULL) {
-                    embd = llama_get_embeddings_ith(ctx, i);
-                }
-
-                if (embd == NULL) {
-                    LOG("failed to get embeddings");
-                    continue;
-                }
-
-                score = embd[0];
-            }
-        }
-
-        // Format result as JSON similar to the embedding function
-        res.result_json = json
-        {
-            {"score", score},
-            {"tokens", slot.num_prompt_tokens}
-        };
-        
-        queue_results.send(res);
-    }
-
-    void request_completion(int task_id, json data, bool infill, bool embedding, bool rerank, int multitask_id)
+    void request_completion(int task_id, json data, bool infill, bool embedding, int multitask_id)
    {
        task_server task;
        task.id = task_id;
@@ -1480,7 +1389,6 @@ struct llama_server_context
        task.data = std::move(data);
        task.infill_mode = infill;
        task.embedding_mode = embedding;
-        task.reranking_mode = rerank;
        task.type = TASK_TYPE_COMPLETION;
        task.multitask_id = multitask_id;

@@ -1612,7 +1520,7 @@ struct llama_server_context
            subtask_data["prompt"] = subtask_data["prompt"][i];

            // subtasks inherit everything else (infill mode, embedding mode, etc.)
-            request_completion(subtask_ids[i], subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multiprompt_task.reranking_mode, multitask_id);
+            request_completion(subtask_ids[i], subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multitask_id);
        }
    }

@@ -1651,7 +1559,6 @@ struct llama_server_context

                slot->infill       = task.infill_mode;
                slot->embedding    = task.embedding_mode;
-                slot->reranker    = task.reranking_mode;
                slot->task_id      = task.id;
                slot->multitask_id = task.multitask_id;

@@ -2089,15 +1996,7 @@ struct llama_server_context
                // prompt evaluated for embedding
                if (slot.embedding)
                {
-                    send_embedding(slot, batch_view);
-                    slot.release();
-                    slot.i_batch = -1;
-                    continue;
-                }
-
-                if (slot.reranker)
-                {
-                    send_rerank(slot, batch_view);
+                    send_embedding(slot);
                    slot.release();
                    slot.i_batch = -1;
                    continue;
@@ -2191,11 +2090,7 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
 }

 std::function<void(int)> shutdown_handler;
-
-inline void signal_handler(int signal) {
-    exit(1);
-}
-
+inline void signal_handler(int signal) { shutdown_handler(signal); }

 /////////////////////////////////
 ////////////////////////////////
@@ -2391,15 +2286,15 @@ static std::string get_all_kv_cache_types() {
 }

 static void params_parse(const backend::ModelOptions* request,
-                                common_params & params, llama_server_context &llama) {
+                                common_params & params) {
   
    // this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809

-    params.model.path = request->modelfile();
+    params.model = request->modelfile();
    if (!request->mmproj().empty()) {
    // get the directory of modelfile
-      std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\"));
-      params.mmproj.path = model_dir + "/"+ request->mmproj();
+      std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
+      params.mmproj = model_dir + "/"+ request->mmproj();
    }
    //  params.model_alias ??
    params.model_alias =  request->modelfile();
@@ -2429,20 +2324,6 @@ static void params_parse(const backend::ModelOptions* request,
        add_rpc_devices(std::string(llama_grpc_servers));
    }
    
-     // decode options. Options are in form optname:optvale, or if booleans only optname.
-    for (int i = 0; i < request->options_size(); i++) {
-        std::string opt = request->options(i);
-        char *optname = strtok(&opt[0], ":");
-        char *optval = strtok(NULL, ":");
-        if (optval == NULL) {
-            optval = "true";
-        }
-
-        if (!strcmp(optname, "gpu")) {
-            llama.has_gpu = true;
-        }
-    }
-
    // TODO: Add yarn

    if (!request->tensorsplit().empty()) {
@@ -2474,7 +2355,7 @@ static void params_parse(const backend::ModelOptions* request,
        scale_factor = request->lorascale();
     }
     // get the directory of modelfile
-     std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\"));
+     std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
     params.lora_adapters.push_back({ model_dir + "/"+request->loraadapter(), scale_factor });
    }
    params.use_mlock = request->mlock();
@@ -2512,12 +2393,12 @@ static void params_parse(const backend::ModelOptions* request,
        llama.grammar_lazy = true;
        for (int i = 0; i < request->grammartriggers_size(); i++) {
            common_grammar_trigger trigger;
-	    trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_WORD;
-            trigger.value = request->grammartriggers(i).word();
-	    // trigger.at_start = request->grammartriggers(i).at_start();
-            llama.grammar_triggers.push_back(trigger);
+            trigger.word = request->grammartriggers(i).word();
+            trigger.at_start = request->grammartriggers(i).at_start();
+            llama.grammar_trigger_words.push_back(trigger);
            LOG_INFO("grammar trigger", {
-                { "word", trigger.value },
+                { "word", trigger.word },
+                { "at_start", trigger.at_start }
            });
        }
    }
@@ -2536,7 +2417,7 @@ public:
  grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
    // Implement LoadModel RPC
    common_params params;
-    params_parse(request, params, llama);
+    params_parse(request, params);

    llama_backend_init();
    llama_numa_init(params.numa);
@@ -2558,7 +2439,7 @@ public:
        json data = parse_options(true, request, llama);
        const int task_id = llama.queue_tasks.get_new_id();
        llama.queue_results.add_waiting_task_id(task_id);
-        llama.request_completion(task_id, data, false, false, false, -1);
+        llama.request_completion(task_id, data, false, false, -1);
        while (true)
        {
            task_result result = llama.queue_results.recv(task_id);
@@ -2612,7 +2493,7 @@ public:
        json data = parse_options(false, request, llama);
        const int task_id = llama.queue_tasks.get_new_id();
        llama.queue_results.add_waiting_task_id(task_id);
-        llama.request_completion(task_id, data, false, false, false, -1);
+        llama.request_completion(task_id, data, false, false, -1);
        std::string completion_text;
        task_result result = llama.queue_results.recv(task_id);
        if (!result.error && result.stop) {
@@ -2649,7 +2530,7 @@ public:
        json data = parse_options(false, request, llama);
        const int task_id = llama.queue_tasks.get_new_id();
        llama.queue_results.add_waiting_task_id(task_id);
-        llama.request_completion(task_id, { {"prompt", data["embeddings"]}, { "n_predict", 0}, {"image_data", ""} }, false, true, false, -1);
+        llama.request_completion(task_id, { {"prompt", data["embeddings"]}, { "n_predict", 0}, {"image_data", ""} }, false, true, -1);
        // get the result
        task_result result = llama.queue_results.recv(task_id);
        //std::cout << "Embedding result JSON" << result.result_json.dump() << std::endl;
@@ -2681,46 +2562,6 @@ public:
        return grpc::Status::OK;
    }

-    grpc::Status Rerank(ServerContext* context, const backend::RerankRequest* request, backend::RerankResult* rerankResult) {
-        // Create a JSON object with the query and documents
-        json data = {
-            {"prompt", request->query()},
-            {"documents", request->documents()},
-            {"top_n", request->top_n()}
-        };
-
-        // Generate a new task ID
-        const int task_id = llama.queue_tasks.get_new_id();
-        llama.queue_results.add_waiting_task_id(task_id);
-
-        // Queue the task with reranking mode enabled
-        llama.request_completion(task_id, data, false, false, true, -1);
-
-        // Get the result
-        task_result result = llama.queue_results.recv(task_id);
-        llama.queue_results.remove_waiting_task_id(task_id);
-
-        if (!result.error && result.stop) {
-            // Set usage information
-            backend::Usage* usage = rerankResult->mutable_usage();
-            usage->set_total_tokens(result.result_json.value("tokens", 0));
-            usage->set_prompt_tokens(result.result_json.value("tokens", 0));
-
-            // Get the score from the result
-            float score = result.result_json.value("score", 0.0f);
-
-            // Create document results for each input document
-            for (int i = 0; i < request->documents_size(); i++) {
-                backend::DocumentResult* doc_result = rerankResult->add_results();
-                doc_result->set_index(i);
-                doc_result->set_text(request->documents(i));
-                doc_result->set_relevance_score(score);
-            }
-        }
-
-        return grpc::Status::OK;
-    }
-
    grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
        llama_client_slot* active_slot = llama.get_active_slot();

@@ -2753,9 +2594,7 @@ void RunServer(const std::string& server_address) {
  ServerBuilder builder;
  builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
  builder.RegisterService(&service);
-  builder.SetMaxMessageSize(50 * 1024 * 1024); // 50MB
-  builder.SetMaxSendMessageSize(50 * 1024 * 1024); // 50MB
-  builder.SetMaxReceiveMessageSize(50 * 1024 * 1024); // 50MB
+
  std::unique_ptr<Server> server(builder.BuildAndStart());
  std::cout << "Server listening on " << server_address << std::endl;
  server->Wait();
@@ -2764,20 +2603,6 @@ void RunServer(const std::string& server_address) {
 int main(int argc, char** argv) {
  std::string server_address("localhost:50051");

-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-    struct sigaction sigint_action;
-    sigint_action.sa_handler = signal_handler;
-    sigemptyset (&sigint_action.sa_mask);
-    sigint_action.sa_flags = 0;
-    sigaction(SIGINT, &sigint_action, NULL);
-    sigaction(SIGTERM, &sigint_action, NULL);
-#elif defined (_WIN32)
-    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
-        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
-    };
-    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
-#endif
-
  // Define long and short options
  struct option long_options[] = {
      {"addr", required_argument, nullptr, 'a'},
--- a/backend/cpp/llama/prepare.sh
+++ b/backend/cpp/llama/prepare.sh
@@ -21,7 +21,6 @@ fi
 ## XXX: In some versions of CMake clip wasn't being built before llama.
 ## This is an hack for now, but it should be fixed in the future.
 cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
-cp -rfv llama.cpp/examples/llava/clip-impl.h llama.cpp/examples/grpc-server/clip-impl.h
 cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
 echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
 cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
--- a/backend/cpp/llama/utils.hpp
+++ b/backend/cpp/llama/utils.hpp
@@ -61,7 +61,6 @@ struct task_server {
    json data;
    bool infill_mode = false;
    bool embedding_mode = false;
-    bool reranking_mode = false;
    int multitask_id = -1;
 };

--- a/backend/go/image/stablediffusion-ggml/Makefile
+++ b/backend/go/image/stablediffusion-ggml/Makefile
@@ -8,13 +8,6 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 # keep standard at C11 and C++11
 CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC

-GOCMD?=go
-CGO_LDFLAGS?=
-# Avoid parent make file overwriting CGO_LDFLAGS which is needed for hipblas
-CGO_LDFLAGS_SYCL=
-GO_TAGS?=
-LD_FLAGS?=
-
 # Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

@@ -28,7 +21,7 @@ else ifeq ($(BUILD_TYPE),openblas)
 # If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
-# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
+# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ 
 else ifeq ($(BUILD_TYPE),hipblas)
 	CMAKE_ARGS+=-DGGML_HIP=ON
 # If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
@@ -43,35 +36,16 @@ else ifeq ($(OS),Darwin)
 	endif
 endif

-ifeq ($(BUILD_TYPE),sycl_f16)
-	CMAKE_ARGS+=-DGGML_SYCL=ON \
-		-DCMAKE_C_COMPILER=icx \
-		-DCMAKE_CXX_COMPILER=icpx \
-		-DSD_SYCL=ON \
-		-DGGML_SYCL_F16=ON
-	CC=icx
-	CXX=icpx
-	CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
-	CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
-	CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
-	CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
-endif
+# ifeq ($(BUILD_TYPE),sycl_f16)
+# 	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DSD_SYCL=ON
+# endif

-ifeq ($(BUILD_TYPE),sycl_f32)
-	CMAKE_ARGS+=-DGGML_SYCL=ON \
-		-DCMAKE_C_COMPILER=icx \
-		-DCMAKE_CXX_COMPILER=icpx \
-		-DSD_SYCL=ON
-	CC=icx
-	CXX=icpx
-	CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
-	CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
-	CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
-	CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
-endif
+# ifeq ($(BUILD_TYPE),sycl_f32)
+# 	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON
+# endif

 # warnings
-# CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
+CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function

 # Find all .a archives in ARCHIVE_DIR
 # (ggml can have different backends cpu, cuda, etc., each backend generates a .a archive)
@@ -112,24 +86,11 @@ endif
 	$(MAKE) $(COMBINED_LIB)

 gosd.o:
-ifneq (,$(findstring sycl,$(BUILD_TYPE)))
-	+bash -c "source $(ONEAPI_VARS); \
-	$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c"
-else
 	$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c
-endif

 libsd.a: gosd.o
 	cp $(INCLUDE_PATH)/build/libstable-diffusion.a ./libsd.a
 	$(AR) rcs libsd.a gosd.o

-stablediffusion-ggml:
-	CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_SYCL)" C_INCLUDE_PATH="$(INCLUDE_PATH)" LIBRARY_PATH="$(LIBRARY_PATH)" \
-	CC="$(CC)" CXX="$(CXX)" CGO_CXXFLAGS="$(CGO_CXXFLAGS)" \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o ../../../../backend-assets/grpc/stablediffusion-ggml ./
-ifneq ($(UPX),)
-	$(UPX) ../../../../backend-assets/grpc/stablediffusion-ggml
-endif
-
 clean:
-	rm -rf gosd.o libsd.a build $(COMBINED_LIB)
+	rm -rf gosd.o libsd.a build $(COMBINED_LIB)
--- a/backend/python/autogptq/Makefile
+++ b/backend/python/autogptq/Makefile
@@ -0,0 +1,17 @@
+.PHONY: autogptq
+autogptq: protogen
+	bash install.sh
+# protogen: make sure both generated gRPC Python modules exist.
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+# protogen-clean: delete the generated gRPC Python modules.
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+# Generate the Python modules from backend.proto, resolved through the -I../.. include path.
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
+# clean: remove the generated modules plus the venv and __pycache__ directories.
+.PHONY: clean
+clean: protogen-clean
+	rm -rf venv __pycache__
--- a/backend/python/autogptq/README.md
+++ b/backend/python/autogptq/README.md
@@ -0,0 +1,5 @@
+# Creating a separate environment for the autogptq project
+
+```
+make autogptq
+```
--- a/backend/python/autogptq/backend.py
+++ b/backend/python/autogptq/backend.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python3
+from concurrent import futures
+import argparse
+import signal
+import sys
+import os
+import time
+import base64
+
+import grpc
+import backend_pb2
+import backend_pb2_grpc
+
+from auto_gptq import AutoGPTQForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import TextGenerationPipeline
+
+_ONE_DAY_IN_SECONDS = 60 * 60 * 24
+
+# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
+MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
+
+# Implement the BackendServicer class with the service methods
+class BackendServicer(backend_pb2_grpc.BackendServicer):
+    """gRPC servicer that loads one AutoGPTQ (or Qwen-VL) model and serves text generation."""
+
+    def Health(self, request, context):
+        """Liveness probe: always replies with an OK message."""
+        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
+
+    def LoadModel(self, request, context):
+        """Load the tokenizer and model named by request.Model; errors are returned as a failed Result."""
+        try:
+            device = request.Device if request.Device != "" else "cuda:0"
+
+            # support loading local model files
+            model_path = os.path.join(os.environ.get('MODELS_PATH', './'), request.Model)
+            tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=request.TrustRemoteCode)
+
+            # support model `Qwen/Qwen-VL-Chat-Int4`
+            if "qwen-vl" in request.Model.lower():
+                model_name = "Qwen-VL-Chat"
+                model = AutoModelForCausalLM.from_pretrained(model_path,
+                    trust_remote_code=request.TrustRemoteCode,
+                    device_map="auto").eval()
+            else:
+                model_name = request.Model
+                model = AutoGPTQForCausalLM.from_quantized(model_path,
+                    model_basename=request.ModelBaseName,
+                    use_safetensors=True,
+                    trust_remote_code=request.TrustRemoteCode,
+                    device=device,
+                    use_triton=request.UseTriton,
+                    quantize_config=None)
+
+            # model_name is recorded for every model: recompile_vl_prompt reads it on each
+            # Predict call, and leaving it unset for non-Qwen-VL models raised AttributeError.
+            self.model_name = model_name
+            self.model = model
+            self.tokenizer = tokenizer
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+        return backend_pb2.Result(message="Model loaded successfully", success=True)
+
+    def Predict(self, request, context):
+        """Generate a completion for request.Prompt and return it with the prompt text removed."""
+        # A zero value in the request selects the backend default for that setting.
+        penalty = request.Penalty if request.Penalty != 0.0 else 1.0
+        tokens = request.Tokens if request.Tokens != 0 else 512
+        top_p = request.TopP if request.TopP != 0.0 else 0.95
+
+        compiled_prompt, image_paths = self.recompile_vl_prompt(request)
+        print(f"Prompt: {compiled_prompt}", file=sys.stderr)
+
+        try:
+            pipeline = TextGenerationPipeline(
+                model=self.model,
+                tokenizer=self.tokenizer,
+                max_new_tokens=tokens,
+                temperature=request.Temperature,
+                top_p=top_p,
+                repetition_penalty=penalty,
+                )
+            t = pipeline(compiled_prompt)[0]["generated_text"]
+            print(f"generated_text: {t}", file=sys.stderr)
+        finally:
+            # house keeping. Remove the image files from /tmp folder, even when
+            # generation failed, so decoded images are not left behind on errors.
+            for img_path in image_paths:
+                try:
+                    os.remove(img_path)
+                except Exception as e:
+                    print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
+
+        if compiled_prompt in t:
+            t = t.replace(compiled_prompt, "")
+        return backend_pb2.Result(message=bytes(t, encoding='utf-8'))
+
+    def PredictStream(self, request, context):
+        """Streaming variant of Predict; real token streaming is not implemented yet."""
+        # A response-streaming handler must produce an iterator, so the single
+        # Predict result is yielded instead of being returned as a bare message.
+        yield self.Predict(request, context)
+
+    def recompile_vl_prompt(self, request):
+        """Return (prompt, image_paths), inlining request.Images into the prompt for Qwen-VL models."""
+        prompt = request.Prompt
+        image_paths = []
+
+        if "qwen-vl" in self.model_name.lower():
+            # request.Images holds base64 encoded images: decode each one to a file in /tmp
+            # and replace its "[img-%d]" placeholder in the prompt with the file path.
+            for i, img in enumerate(request.Images):
+                timestamp = str(int(time.time() * 1000))  # Generate timestamp
+                # The index keeps names unique when several images land in the same millisecond.
+                img_path = f"/tmp/vl-{timestamp}-{i}.jpg"
+                with open(img_path, "wb") as f:
+                    f.write(base64.b64decode(img))
+                image_paths.append(img_path)
+                prompt = prompt.replace(f"[img-{i}]", "<img>" + img_path + "</img>,")
+        return (prompt, image_paths)
+
+def serve(address):
+    """Start the gRPC backend on `address` and block until the process is interrupted."""
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
+    server.add_insecure_port(address)
+    server.start()
+    print("Server started. Listening on: " + address, file=sys.stderr)
+
+    def _shutdown(signum, frame):
+        print("Received termination signal. Shutting down...")
+        server.stop(0)
+        sys.exit(0)
+
+    # SIGINT and SIGTERM share the same shutdown handler.
+    for signum in (signal.SIGINT, signal.SIGTERM):
+        signal.signal(signum, _shutdown)
+
+    try:
+        while True:
+            time.sleep(_ONE_DAY_IN_SECONDS)
+    except KeyboardInterrupt:
+        server.stop(0)
+
+if __name__ == "__main__":
+    # Read the listen address from --addr, then hand it to serve().
+    cli = argparse.ArgumentParser(description="Run the gRPC server.")
+    cli.add_argument(
+        "--addr", default="localhost:50051", help="The address to bind the server to."
+    )
+
+    serve(cli.parse_args().addr)
--- a/backend/python/autogptq/install.sh
+++ b/backend/python/autogptq/install.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+set -e
+# Quoted so the script still works when its directory path contains spaces.
+source "$(dirname "$0")/../common/libbackend.sh"
+
+# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
+# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
+# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
+# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
+if [ "x${BUILD_PROFILE}" == "xintel" ]; then
+    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
+fi
+
+installRequirements
--- a/backend/python/autogptq/requirements-cublas11.txt
+++ b/backend/python/autogptq/requirements-cublas11.txt
@@ -0,0 +1,2 @@
+--extra-index-url https://download.pytorch.org/whl/cu118
+torch==2.4.1+cu118
--- a/backend/python/autogptq/requirements-cublas12.txt
+++ b/backend/python/autogptq/requirements-cublas12.txt
@@ -0,0 +1 @@
+torch==2.4.1
--- a/backend/python/autogptq/requirements-hipblas.txt
+++ b/backend/python/autogptq/requirements-hipblas.txt
@@ -0,0 +1,2 @@
+--extra-index-url https://download.pytorch.org/whl/rocm6.0
+torch==2.4.1+rocm6.0
--- a/backend/python/autogptq/requirements-intel.txt
+++ b/backend/python/autogptq/requirements-intel.txt
@@ -0,0 +1,6 @@
+--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+intel-extension-for-pytorch==2.3.110+xpu
+torch==2.3.1+cxx11.abi
+oneccl_bind_pt==2.3.100+xpu
+optimum[openvino]
+setuptools
--- a/backend/python/autogptq/requirements.txt
+++ b/backend/python/autogptq/requirements.txt
@@ -0,0 +1,6 @@
+accelerate
+auto-gptq==0.7.1
+grpcio==1.70.0
+protobuf
+certifi
+transformers
--- a/backend/python/autogptq/run.sh
+++ b/backend/python/autogptq/run.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+source "$(dirname "$0")/../common/libbackend.sh"
+# Quote "$@" so arguments containing spaces reach the backend as single arguments.
+startBackend "$@"
--- a/backend/python/autogptq/test.sh
+++ b/backend/python/autogptq/test.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+set -e
+# Quoted so the script still works when its directory path contains spaces.
+source "$(dirname "$0")/../common/libbackend.sh"
+
+runUnittests
--- a/backend/python/bark/backend.py
+++ b/backend/python/bark/backend.py
@@ -61,12 +61,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.Result(success=True)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
-        options=[
-            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
-            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
-            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
-        ])
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/bark/requirements.txt
+++ b/backend/python/bark/requirements.txt
@@ -1,4 +1,4 @@
 bark==0.1.5
-grpcio==1.71.0
+grpcio==1.70.0
 protobuf
 certifi
--- a/backend/python/common/template/requirements.txt
+++ b/backend/python/common/template/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.71.0
+grpcio==1.70.0
 protobuf
 grpcio-tools
--- a/backend/python/coqui/backend.py
+++ b/backend/python/coqui/backend.py
@@ -86,12 +86,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.Result(success=True)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
-        options=[
-            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
-            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
-            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
-        ])
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/coqui/requirements.txt
+++ b/backend/python/coqui/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.71.0
+grpcio==1.70.0
 protobuf
 certifi
 packaging==24.1
--- a/backend/python/diffusers/backend.py
+++ b/backend/python/diffusers/backend.py
@@ -19,7 +19,7 @@ import grpc

 from diffusers import SanaPipeline, StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, \
    EulerAncestralDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
-from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline, Lumina2Text2ImgPipeline
+from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline
 from diffusers.pipelines.stable_diffusion import safety_checker
 from diffusers.utils import load_image, export_to_video
 from compel import Compel, ReturnedEmbeddingsType
@@ -287,12 +287,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

                    if request.LowVRAM:
                        self.pipe.enable_model_cpu_offload()
-            elif request.PipelineType == "Lumina2Text2ImgPipeline":
-                self.pipe = Lumina2Text2ImgPipeline.from_pretrained(
-                    request.Model,
-                    torch_dtype=torch.bfloat16)
-                if request.LowVRAM:
-                    self.pipe.enable_model_cpu_offload()
            elif request.PipelineType == "SanaPipeline":
                self.pipe = SanaPipeline.from_pretrained(
                    request.Model,
@@ -522,12 +516,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):


 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
-        options=[
-            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
-            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
-            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
-        ])
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/diffusers/requirements.txt
+++ b/backend/python/diffusers/requirements.txt
@@ -1,5 +1,5 @@
 setuptools
-grpcio==1.71.0
+grpcio==1.70.0
 pillow
 protobuf
 certifi
--- a/backend/python/exllama2/backend.py
+++ b/backend/python/exllama2/backend.py
@@ -105,12 +105,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):


 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
-        options=[
-            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
-            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
-            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
-        ])
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/exllama2/requirements.txt
+++ b/backend/python/exllama2/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.71.0
+grpcio==1.70.0
 protobuf
 certifi
 wheel
--- a/backend/python/faster-whisper/backend.py
+++ b/backend/python/faster-whisper/backend.py
@@ -62,12 +62,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.TranscriptResult(segments=resultSegments, text=text)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
-        options=[
-            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
-            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
-            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
-        ])
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/faster-whisper/requirements.txt
+++ b/backend/python/faster-whisper/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.71.0
+grpcio==1.70.0
 protobuf
 grpcio-tools
--- a/backend/python/kokoro/backend.py
+++ b/backend/python/kokoro/backend.py
@@ -99,12 +99,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.Result(success=True)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
-        options=[
-            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
-            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
-            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
-        ])
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/kokoro/requirements.txt
+++ b/backend/python/kokoro/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.71.0
+grpcio==1.70.0
 protobuf
 phonemizer
 scipy
--- a/backend/python/rerankers/backend.py
+++ b/backend/python/rerankers/backend.py
@@ -91,12 +91,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.RerankResult(usage=usage, results=results)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
-        options=[
-            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
-            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
-            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
-        ])
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/rerankers/requirements.txt
+++ b/backend/python/rerankers/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.71.0
+grpcio==1.70.0
 protobuf
 certifi
--- a/backend/python/transformers/backend.py
+++ b/backend/python/transformers/backend.py
@@ -559,12 +559,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

 async def serve(address):
    # Start asyncio gRPC server
-    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
-        options=[
-            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
-            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
-            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
-        ])
+    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
    # Add the servicer to the server
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    # Bind the server to the address
--- a/backend/python/transformers/requirements.txt
+++ b/backend/python/transformers/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.71.0
+grpcio==1.70.0
 protobuf
 certifi
 setuptools
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -320,12 +320,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

 async def serve(address):
    # Start asyncio gRPC server
-    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
-        options=[
-            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
-            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
-            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
-        ])
+    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
    # Add the servicer to the server
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    # Bind the server to the address
--- a/backend/python/vllm/requirements.txt
+++ b/backend/python/vllm/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.71.0
+grpcio==1.70.0
 protobuf
 certifi
 setuptools
--- a/core/application/application.go
+++ b/core/application/application.go
@@ -16,7 +16,7 @@ type Application struct {
 func newApplication(appConfig *config.ApplicationConfig) *Application {
 	return &Application{
 		backendLoader:      config.NewBackendConfigLoader(appConfig.ModelPath),
-		modelLoader:        model.NewModelLoader(appConfig.ModelPath, appConfig.SingleBackend),
+		modelLoader:        model.NewModelLoader(appConfig.ModelPath),
 		applicationConfig:  appConfig,
 		templatesEvaluator: templates.NewEvaluator(appConfig.ModelPath),
 	}
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -143,7 +143,7 @@ func New(opts ...config.AppOption) (*Application, error) {
 		}()
 	}

-	if options.LoadToMemory != nil && !options.SingleBackend {
+	if options.LoadToMemory != nil {
 		for _, m := range options.LoadToMemory {
 			cfg, err := application.BackendLoader().LoadBackendConfigFileByNameDefaultOptions(m, options)
 			if err != nil {
--- a/core/backend/embeddings.go
+++ b/core/backend/embeddings.go
@@ -17,7 +17,6 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendCo
 	if err != nil {
 		return nil, err
 	}
-	defer loader.Close()

 	var fn func() ([]float32, error)
 	switch model := inferenceModel.(type) {
--- a/core/backend/image.go
+++ b/core/backend/image.go
@@ -16,7 +16,6 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
 	if err != nil {
 		return nil, err
 	}
-	defer loader.Close()

 	fn := func() error {
 		_, err := inferenceModel.GenerateImage(
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -53,7 +53,6 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 	if err != nil {
 		return nil, err
 	}
-	defer loader.Close()

 	var protoMessages []*proto.Message
 	// if we are using the tokenizer template, we need to convert the messages to proto messages
@@ -117,11 +116,6 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 		}

 		if tokenCallback != nil {
-
-			if c.TemplateConfig.ReplyPrefix != "" {
-				tokenCallback(c.TemplateConfig.ReplyPrefix, tokenUsage)
-			}
-
 			ss := ""

 			var partialRune []byte
@@ -171,13 +165,8 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 			tokenUsage.TimingTokenGeneration = reply.TimingTokenGeneration
 			tokenUsage.TimingPromptProcessing = reply.TimingPromptProcessing

-			response := string(reply.Message)
-			if c.TemplateConfig.ReplyPrefix != "" {
-				response = c.TemplateConfig.ReplyPrefix + response
-			}
-
 			return LLMResponse{
-				Response: response,
+				Response: string(reply.Message),
 				Usage:    tokenUsage,
 			}, err
 		}
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -40,6 +40,10 @@ func ModelOptions(c config.BackendConfig, so *config.ApplicationConfig, opts ...
 	grpcOpts := grpcModelOpts(c)
 	defOpts = append(defOpts, model.WithLoadGRPCLoadModelOpts(grpcOpts))

+	if so.SingleBackend {
+		defOpts = append(defOpts, model.WithSingleActiveBackend())
+	}
+
 	if so.ParallelBackendRequests {
 		defOpts = append(defOpts, model.EnableParallelRequests)
 	}
@@ -117,7 +121,8 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 	triggers := make([]*pb.GrammarTrigger, 0)
 	for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers {
 		triggers = append(triggers, &pb.GrammarTrigger{
-			Word: t.Word,
+			Word:    t.Word,
+			AtStart: t.AtStart,
 		})

 	}
@@ -157,33 +162,38 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		DisableLogStatus:     c.DisableLogStatus,
 		DType:                c.DType,
 		// LimitMMPerPrompt vLLM
-		LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
-		LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
-		LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
-		MMProj:              c.MMProj,
-		FlashAttention:      c.FlashAttention,
-		CacheTypeKey:        c.CacheTypeK,
-		CacheTypeValue:      c.CacheTypeV,
-		NoKVOffload:         c.NoKVOffloading,
-		YarnExtFactor:       c.YarnExtFactor,
-		YarnAttnFactor:      c.YarnAttnFactor,
-		YarnBetaFast:        c.YarnBetaFast,
-		YarnBetaSlow:        c.YarnBetaSlow,
-		NGQA:                c.NGQA,
-		RMSNormEps:          c.RMSNormEps,
-		MLock:               mmlock,
-		RopeFreqBase:        c.RopeFreqBase,
-		RopeScaling:         c.RopeScaling,
-		Type:                c.ModelType,
-		RopeFreqScale:       c.RopeFreqScale,
-		NUMA:                c.NUMA,
-		Embeddings:          embeddings,
-		LowVRAM:             lowVRAM,
-		NGPULayers:          int32(nGPULayers),
-		MMap:                mmap,
-		MainGPU:             c.MainGPU,
-		Threads:             int32(*c.Threads),
-		TensorSplit:         c.TensorSplit,
+		LimitImagePerPrompt:  int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
+		LimitVideoPerPrompt:  int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
+		LimitAudioPerPrompt:  int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
+		MMProj:               c.MMProj,
+		FlashAttention:       c.FlashAttention,
+		CacheTypeKey:         c.CacheTypeK,
+		CacheTypeValue:       c.CacheTypeV,
+		NoKVOffload:          c.NoKVOffloading,
+		YarnExtFactor:        c.YarnExtFactor,
+		YarnAttnFactor:       c.YarnAttnFactor,
+		YarnBetaFast:         c.YarnBetaFast,
+		YarnBetaSlow:         c.YarnBetaSlow,
+		NGQA:                 c.NGQA,
+		RMSNormEps:           c.RMSNormEps,
+		MLock:                mmlock,
+		RopeFreqBase:         c.RopeFreqBase,
+		RopeScaling:          c.RopeScaling,
+		Type:                 c.ModelType,
+		RopeFreqScale:        c.RopeFreqScale,
+		NUMA:                 c.NUMA,
+		Embeddings:           embeddings,
+		LowVRAM:              lowVRAM,
+		NGPULayers:           int32(nGPULayers),
+		MMap:                 mmap,
+		MainGPU:              c.MainGPU,
+		Threads:              int32(*c.Threads),
+		TensorSplit:          c.TensorSplit,
+		// AutoGPTQ
+		ModelBaseName:    c.AutoGPTQ.ModelBaseName,
+		Device:           c.AutoGPTQ.Device,
+		UseTriton:        c.AutoGPTQ.Triton,
+		UseFastTokenizer: c.AutoGPTQ.UseFastTokenizer,
 		// RWKV
 		Tokenizer: c.Tokenizer,
 	}
--- a/core/backend/rerank.go
+++ b/core/backend/rerank.go
@@ -12,10 +12,10 @@ import (
 func Rerank(request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
 	opts := ModelOptions(backendConfig, appConfig)
 	rerankModel, err := loader.Load(opts...)
+
 	if err != nil {
 		return nil, err
 	}
-	defer loader.Close()

 	if rerankModel == nil {
 		return nil, fmt.Errorf("could not load rerank model")
--- a/core/backend/soundgeneration.go
+++ b/core/backend/soundgeneration.go
@@ -26,10 +26,10 @@ func SoundGeneration(

 	opts := ModelOptions(backendConfig, appConfig)
 	soundGenModel, err := loader.Load(opts...)
+
 	if err != nil {
 		return "", nil, err
 	}
-	defer loader.Close()

 	if soundGenModel == nil {
 		return "", nil, fmt.Errorf("could not load sound generation model")
--- a/core/backend/token_metrics.go
+++ b/core/backend/token_metrics.go
@@ -20,7 +20,6 @@ func TokenMetrics(
 	if err != nil {
 		return nil, err
 	}
-	defer loader.Close()

 	if model == nil {
 		return nil, fmt.Errorf("could not loadmodel model")
--- a/core/backend/tokenize.go
+++ b/core/backend/tokenize.go
@@ -14,10 +14,10 @@ func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.Bac

 	opts := ModelOptions(backendConfig, appConfig)
 	inferenceModel, err = loader.Load(opts...)
+
 	if err != nil {
 		return schema.TokenizeResponse{}, err
 	}
-	defer loader.Close()

 	predictOptions := gRPCPredictOpts(backendConfig, loader.ModelPath)
 	predictOptions.Prompt = s
--- a/core/backend/transcript.go
+++ b/core/backend/transcript.go
@@ -24,7 +24,6 @@ func ModelTranscription(audio, language string, translate bool, ml *model.ModelL
 	if err != nil {
 		return nil, err
 	}
-	defer ml.Close()

 	if transcriptionModel == nil {
 		return nil, fmt.Errorf("could not load transcription model")
--- a/core/backend/tts.go
+++ b/core/backend/tts.go
@@ -23,10 +23,10 @@ func ModelTTS(
 ) (string, *proto.Result, error) {
 	opts := ModelOptions(backendConfig, appConfig, model.WithDefaultBackendString(model.PiperBackend))
 	ttsModel, err := loader.Load(opts...)
+
 	if err != nil {
 		return "", nil, err
 	}
-	defer loader.Close()

 	if ttsModel == nil {
 		return "", nil, fmt.Errorf("could not load tts model %q", backendConfig.Model)
--- a/core/backend/vad.go
+++ b/core/backend/vad.go
@@ -19,8 +19,6 @@ func VAD(request *schema.VADRequest,
 	if err != nil {
 		return nil, err
 	}
-	defer ml.Close()
-
 	req := proto.VADRequest{
 		Audio: request.Audio,
 	}
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -38,7 +38,7 @@ type RunCMD struct {

 	F16         bool `name:"f16" env:"LOCALAI_F16,F16" help:"Enable GPU acceleration" group:"performance"`
 	Threads     int  `env:"LOCALAI_THREADS,THREADS" short:"t" help:"Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested" group:"performance"`
-	ContextSize int  `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" help:"Default context size for models" group:"performance"`
+	ContextSize int  `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" default:"512" help:"Default context size for models" group:"performance"`

 	Address                            string   `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
 	CORS                               bool     `env:"LOCALAI_CORS,CORS" help:"" group:"api"`
--- a/core/cli/soundgeneration.go
+++ b/core/cli/soundgeneration.go
@@ -74,7 +74,7 @@ func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {
 		AssetsDestination:    t.BackendAssetsPath,
 		ExternalGRPCBackends: externalBackends,
 	}
-	ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)
+	ml := model.NewModelLoader(opts.ModelPath)

 	defer func() {
 		err := ml.StopAllGRPC()
--- a/core/cli/transcript.go
+++ b/core/cli/transcript.go
@@ -32,7 +32,7 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
 	}

 	cl := config.NewBackendConfigLoader(t.ModelsPath)
-	ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)
+	ml := model.NewModelLoader(opts.ModelPath)
 	if err := cl.LoadBackendConfigsFromPath(t.ModelsPath); err != nil {
 		return err
 	}
--- a/core/cli/tts.go
+++ b/core/cli/tts.go
@@ -41,7 +41,7 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
 		AudioDir:          outputDir,
 		AssetsDestination: t.BackendAssetsPath,
 	}
-	ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)
+	ml := model.NewModelLoader(opts.ModelPath)

 	defer func() {
 		err := ml.StopAllGRPC()
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -50,6 +50,9 @@ type BackendConfig struct {
 	// LLM configs (GPT4ALL, Llama.cpp, ...)
 	LLMConfig `yaml:",inline"`

+	// AutoGPTQ specifics
+	AutoGPTQ AutoGPTQ `yaml:"autogptq"`
+
 	// Diffusers
 	Diffusers Diffusers `yaml:"diffusers"`
 	Step      int       `yaml:"step"`
@@ -127,28 +130,28 @@ type LLMConfig struct {
 	TrimSpace       []string `yaml:"trimspace"`
 	TrimSuffix      []string `yaml:"trimsuffix"`

-	ContextSize          *int             `yaml:"context_size"`
-	NUMA                 bool             `yaml:"numa"`
-	LoraAdapter          string           `yaml:"lora_adapter"`
-	LoraBase             string           `yaml:"lora_base"`
-	LoraAdapters         []string         `yaml:"lora_adapters"`
-	LoraScales           []float32        `yaml:"lora_scales"`
-	LoraScale            float32          `yaml:"lora_scale"`
-	NoMulMatQ            bool             `yaml:"no_mulmatq"`
-	DraftModel           string           `yaml:"draft_model"`
-	NDraft               int32            `yaml:"n_draft"`
-	Quantization         string           `yaml:"quantization"`
-	LoadFormat           string           `yaml:"load_format"`
-	GPUMemoryUtilization float32          `yaml:"gpu_memory_utilization"` // vLLM
-	TrustRemoteCode      bool             `yaml:"trust_remote_code"`      // vLLM
-	EnforceEager         bool             `yaml:"enforce_eager"`          // vLLM
-	SwapSpace            int              `yaml:"swap_space"`             // vLLM
-	MaxModelLen          int              `yaml:"max_model_len"`          // vLLM
-	TensorParallelSize   int              `yaml:"tensor_parallel_size"`   // vLLM
-	DisableLogStatus     bool             `yaml:"disable_log_stats"`      // vLLM
-	DType                string           `yaml:"dtype"`                  // vLLM
-	LimitMMPerPrompt     LimitMMPerPrompt `yaml:"limit_mm_per_prompt"`    // vLLM
-	MMProj               string           `yaml:"mmproj"`
+	ContextSize          *int               `yaml:"context_size"`
+	NUMA                 bool               `yaml:"numa"`
+	LoraAdapter          string             `yaml:"lora_adapter"`
+	LoraBase             string             `yaml:"lora_base"`
+	LoraAdapters         []string           `yaml:"lora_adapters"`
+	LoraScales           []float32          `yaml:"lora_scales"`
+	LoraScale            float32            `yaml:"lora_scale"`
+	NoMulMatQ            bool               `yaml:"no_mulmatq"`
+	DraftModel           string             `yaml:"draft_model"`
+	NDraft               int32              `yaml:"n_draft"`
+	Quantization         string             `yaml:"quantization"`
+	LoadFormat           string             `yaml:"load_format"`
+	GPUMemoryUtilization float32            `yaml:"gpu_memory_utilization"` // vLLM
+	TrustRemoteCode      bool               `yaml:"trust_remote_code"`      // vLLM
+	EnforceEager         bool               `yaml:"enforce_eager"`          // vLLM
+	SwapSpace            int                `yaml:"swap_space"`             // vLLM
+	MaxModelLen          int                `yaml:"max_model_len"`          // vLLM
+	TensorParallelSize   int                `yaml:"tensor_parallel_size"`   // vLLM
+	DisableLogStatus     bool               `yaml:"disable_log_stats"`      // vLLM
+	DType                string             `yaml:"dtype"`                  // vLLM
+	LimitMMPerPrompt     LimitMMPerPrompt   `yaml:"limit_mm_per_prompt"`    // vLLM
+	MMProj               string             `yaml:"mmproj"`

 	FlashAttention bool   `yaml:"flash_attention"`
 	NoKVOffloading bool   `yaml:"no_kv_offloading"`
@@ -168,9 +171,17 @@ type LLMConfig struct {

 // LimitMMPerPrompt is a struct that holds the configuration for the limit-mm-per-prompt config in vLLM
 type LimitMMPerPrompt struct {
-	LimitImagePerPrompt int `yaml:"image"`
-	LimitVideoPerPrompt int `yaml:"video"`
-	LimitAudioPerPrompt int `yaml:"audio"`
+	LimitImagePerPrompt   int   `yaml:"image"`
+	LimitVideoPerPrompt   int   `yaml:"video"`
+	LimitAudioPerPrompt   int   `yaml:"audio"`
+}
+
+// AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
+type AutoGPTQ struct {
+	ModelBaseName    string `yaml:"model_base_name"`
+	Device           string `yaml:"device"`
+	Triton           bool   `yaml:"triton"`
+	UseFastTokenizer bool   `yaml:"use_fast_tokenizer"`
 }

 // TemplateConfig is a struct that holds the configuration of the templating system
@@ -202,8 +213,6 @@ type TemplateConfig struct {
 	Multimodal string `yaml:"multimodal"`

 	JinjaTemplate bool `yaml:"jinja_template"`
-
-	ReplyPrefix string `yaml:"reply_prefix"`
 }

 func (c *BackendConfig) UnmarshalYAML(value *yaml.Node) error {
@@ -378,6 +387,16 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
 		cfg.Embeddings = &falseV
 	}

+	// Value passed by the top level are treated as default (no implicit defaults)
+	// defaults are set by the user
+	if ctx == 0 {
+		ctx = 1024
+	}
+
+	if cfg.ContextSize == nil {
+		cfg.ContextSize = &ctx
+	}
+
 	if threads == 0 {
 		// Threads can't be 0
 		threads = 4
@@ -399,7 +418,7 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
 		cfg.Debug = &trueV
 	}

-	guessDefaultsFromFile(cfg, lo.modelPath, ctx)
+	guessDefaultsFromFile(cfg, lo.modelPath)
 }

 func (c *BackendConfig) Validate() bool {
@@ -544,7 +563,7 @@ func (c *BackendConfig) GuessUsecases(u BackendConfigUsecases) bool {
 		}
 	}
 	if (u & FLAG_TTS) == FLAG_TTS {
-		ttsBackends := []string{"bark-cpp", "parler-tts", "piper", "transformers-musicgen"}
+		ttsBackends := []string{"piper", "transformers-musicgen", "parler-tts"}
 		if !slices.Contains(ttsBackends, c.Backend) {
 			return false
 		}
--- a/core/config/gguf.go
+++ b/core/config/gguf.go
@@ -1,253 +0,0 @@
-package config
-
-import (
-	"strings"
-
-	"github.com/rs/zerolog/log"
-
-	gguf "github.com/thxcode/gguf-parser-go"
-)
-
-type familyType uint8
-
-const (
-	Unknown familyType = iota
-	LLaMa3
-	CommandR
-	Phi3
-	ChatML
-	Mistral03
-	Gemma
-	DeepSeek2
-)
-
-const (
-	defaultContextSize = 1024
-)
-
-type settingsConfig struct {
-	StopWords      []string
-	TemplateConfig TemplateConfig
-	RepeatPenalty  float64
-}
-
-// default settings to adopt with a given model family
-var defaultsSettings map[familyType]settingsConfig = map[familyType]settingsConfig{
-	Gemma: {
-		RepeatPenalty: 1.0,
-		StopWords:     []string{"<|im_end|>", "<end_of_turn>", "<start_of_turn>"},
-		TemplateConfig: TemplateConfig{
-			Chat:        "{{.Input }}\n<start_of_turn>model\n",
-			ChatMessage: "<start_of_turn>{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}<end_of_turn>",
-			Completion:  "{{.Input}}",
-		},
-	},
-	DeepSeek2: {
-		StopWords: []string{"<｜end▁of▁sentence｜>"},
-		TemplateConfig: TemplateConfig{
-			ChatMessage: `{{if eq .RoleName "user" -}}User: {{.Content }}
-{{ end -}}
-{{if eq .RoleName "assistant" -}}Assistant: {{.Content}}<｜end▁of▁sentence｜>{{end}}
-{{if eq .RoleName "system" -}}{{.Content}}
-{{end -}}`,
-			Chat: "{{.Input -}}\nAssistant: ",
-		},
-	},
-	LLaMa3: {
-		StopWords: []string{"<|eot_id|>"},
-		TemplateConfig: TemplateConfig{
-			Chat:        "<|begin_of_text|>{{.Input }}\n<|start_header_id|>assistant<|end_header_id|>",
-			ChatMessage: "<|start_header_id|>{{ .RoleName }}<|end_header_id|>\n\n{{.Content }}<|eot_id|>",
-		},
-	},
-	CommandR: {
-		TemplateConfig: TemplateConfig{
-			Chat: "{{.Input -}}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
-			Functions: `<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
-You are a function calling AI model, you can call the following functions:
-## Available Tools
-{{range .Functions}}
- {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
-{{end}}
-When using a tool, reply with JSON, for instance {"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}}
-<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Input -}}`,
-			ChatMessage: `{{if eq .RoleName "user" -}}
-<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
-{{- else if eq .RoleName "system" -}}
-<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
-{{- else if eq .RoleName "assistant" -}}
-<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
-{{- else if eq .RoleName "tool" -}}
-<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
-{{- else if .FunctionCall -}}
-<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{toJson .FunctionCall}}}<|END_OF_TURN_TOKEN|>
-{{- end -}}`,
-		},
-		StopWords: []string{"<|END_OF_TURN_TOKEN|>"},
-	},
-	Phi3: {
-		TemplateConfig: TemplateConfig{
-			Chat:        "{{.Input}}\n<|assistant|>",
-			ChatMessage: "<|{{ .RoleName }}|>\n{{.Content}}<|end|>",
-			Completion:  "{{.Input}}",
-		},
-		StopWords: []string{"<|end|>", "<|endoftext|>"},
-	},
-	ChatML: {
-		TemplateConfig: TemplateConfig{
-			Chat: "{{.Input -}}\n<|im_start|>assistant",
-			Functions: `<|im_start|>system
-You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
-{{range .Functions}}
-{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
-{{end}}
-For each function call return a json object with function name and arguments
-<|im_end|>
-{{.Input -}}
-<|im_start|>assistant`,
-			ChatMessage: `<|im_start|>{{ .RoleName }}
-{{ if .FunctionCall -}}
-Function call:
-{{ else if eq .RoleName "tool" -}}
-Function response:
-{{ end -}}
-{{ if .Content -}}
-{{.Content }}
-{{ end -}}
-{{ if .FunctionCall -}}
-{{toJson .FunctionCall}}
-{{ end -}}<|im_end|>`,
-		},
-		StopWords: []string{"<|im_end|>", "<dummy32000>", "</s>"},
-	},
-	Mistral03: {
-		TemplateConfig: TemplateConfig{
-			Chat:      "{{.Input -}}",
-			Functions: `[AVAILABLE_TOOLS] [{{range .Functions}}{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}{{end}} ] [/AVAILABLE_TOOLS]{{.Input }}`,
-			ChatMessage: `{{if eq .RoleName "user" -}}
-[INST] {{.Content }} [/INST]
-{{- else if .FunctionCall -}}
-[TOOL_CALLS] {{toJson .FunctionCall}} [/TOOL_CALLS]
-{{- else if eq .RoleName "tool" -}}
-[TOOL_RESULTS] {{.Content}} [/TOOL_RESULTS]
-{{- else -}}
-{{ .Content -}}
-{{ end -}}`,
-		},
-		StopWords: []string{"<|im_end|>", "<dummy32000>", "</tool_call>", "<|eot_id|>", "<|end_of_text|>", "</s>", "[/TOOL_CALLS]", "[/ACTIONS]"},
-	},
-}
-
-// this maps well known template used in HF to model families defined above
-var knownTemplates = map[string]familyType{
-	`{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}`:                              ChatML,
-	`{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}`: Mistral03,
-}
-
-func guessGGUFFromFile(cfg *BackendConfig, f *gguf.GGUFFile, defaultCtx int) {
-
-	if defaultCtx == 0 && cfg.ContextSize == nil {
-		ctxSize := f.EstimateLLaMACppUsage().ContextSize
-		if ctxSize > 0 {
-			cSize := int(ctxSize)
-			cfg.ContextSize = &cSize
-		} else {
-			defaultCtx = defaultContextSize
-			cfg.ContextSize = &defaultCtx
-		}
-	}
-
-	if cfg.HasTemplate() {
-		// nothing to guess here
-		log.Debug().Any("name", cfg.Name).Msgf("guessDefaultsFromFile: %s", "template already set")
-		return
-	}
-
-	log.Debug().
-		Any("eosTokenID", f.Tokenizer().EOSTokenID).
-		Any("bosTokenID", f.Tokenizer().BOSTokenID).
-		Any("modelName", f.Model().Name).
-		Any("architecture", f.Architecture().Architecture).Msgf("Model file loaded: %s", cfg.ModelFileName())
-
-	// guess the name
-	if cfg.Name == "" {
-		cfg.Name = f.Model().Name
-	}
-
-	family := identifyFamily(f)
-
-	if family == Unknown {
-		log.Debug().Msgf("guessDefaultsFromFile: %s", "family not identified")
-		return
-	}
-
-	// identify template
-	settings, ok := defaultsSettings[family]
-	if ok {
-		cfg.TemplateConfig = settings.TemplateConfig
-		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: guessed template %+v", cfg.TemplateConfig)
-		if len(cfg.StopWords) == 0 {
-			cfg.StopWords = settings.StopWords
-		}
-		if cfg.RepeatPenalty == 0.0 {
-			cfg.RepeatPenalty = settings.RepeatPenalty
-		}
-	} else {
-		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: no template found for family")
-	}
-
-	if cfg.HasTemplate() {
-		return
-	}
-
-	// identify from well known templates first, otherwise use the raw jinja template
-	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
-	if found {
-		// try to use the jinja template
-		cfg.TemplateConfig.JinjaTemplate = true
-		cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString()
-	}
-}
-
-func identifyFamily(f *gguf.GGUFFile) familyType {
-
-	// identify from well known templates first
-	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
-	if found && chatTemplate.ValueString() != "" {
-		if family, ok := knownTemplates[chatTemplate.ValueString()]; ok {
-			return family
-		}
-	}
-
-	// otherwise try to identify from the model properties
-	arch := f.Architecture().Architecture
-	eosTokenID := f.Tokenizer().EOSTokenID
-	bosTokenID := f.Tokenizer().BOSTokenID
-
-	isYI := arch == "llama" && bosTokenID == 1 && eosTokenID == 2
-	// WTF! Mistral0.3 and isYi have same bosTokenID and eosTokenID
-
-	llama3 := arch == "llama" && eosTokenID == 128009
-	commandR := arch == "command-r" && eosTokenID == 255001
-	qwen2 := arch == "qwen2"
-	phi3 := arch == "phi-3"
-	gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Model().Name), "gemma")
-	deepseek2 := arch == "deepseek2"
-
-	switch {
-	case deepseek2:
-		return DeepSeek2
-	case gemma:
-		return Gemma
-	case llama3:
-		return LLaMa3
-	case commandR:
-		return CommandR
-	case phi3:
-		return Phi3
-	case qwen2, isYI:
-		return ChatML
-	default:
-		return Unknown
-	}
-}
--- a/core/config/guesser.go
+++ b/core/config/guesser.go
@@ -3,12 +3,147 @@ package config
 import (
 	"os"
 	"path/filepath"
+	"strings"

 	"github.com/rs/zerolog/log"
+
 	gguf "github.com/thxcode/gguf-parser-go"
 )

-func guessDefaultsFromFile(cfg *BackendConfig, modelPath string, defaultCtx int) {
+type familyType uint8
+
+const (
+	Unknown familyType = iota
+	LLaMa3
+	CommandR
+	Phi3
+	ChatML
+	Mistral03
+	Gemma
+	DeepSeek2
+)
+
+type settingsConfig struct {
+	StopWords      []string
+	TemplateConfig TemplateConfig
+	RepeatPenalty  float64
+}
+
+// default settings to adopt with a given model family
+var defaultsSettings map[familyType]settingsConfig = map[familyType]settingsConfig{
+	Gemma: {
+		RepeatPenalty: 1.0,
+		StopWords:     []string{"<|im_end|>", "<end_of_turn>", "<start_of_turn>"},
+		TemplateConfig: TemplateConfig{
+			Chat:        "{{.Input }}\n<start_of_turn>model\n",
+			ChatMessage: "<start_of_turn>{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}<end_of_turn>",
+			Completion:  "{{.Input}}",
+		},
+	},
+	DeepSeek2: {
+		StopWords: []string{"<｜end▁of▁sentence｜>"},
+		TemplateConfig: TemplateConfig{
+			ChatMessage: `{{if eq .RoleName "user" -}}User: {{.Content }}
+{{ end -}}
+{{if eq .RoleName "assistant" -}}Assistant: {{.Content}}<｜end▁of▁sentence｜>{{end}}
+{{if eq .RoleName "system" -}}{{.Content}}
+{{end -}}`,
+			Chat: "{{.Input -}}\nAssistant: ",
+		},
+	},
+	LLaMa3: {
+		StopWords: []string{"<|eot_id|>"},
+		TemplateConfig: TemplateConfig{
+			Chat:        "<|begin_of_text|>{{.Input }}\n<|start_header_id|>assistant<|end_header_id|>",
+			ChatMessage: "<|start_header_id|>{{ .RoleName }}<|end_header_id|>\n\n{{.Content }}<|eot_id|>",
+		},
+	},
+	CommandR: {
+		TemplateConfig: TemplateConfig{
+			Chat: "{{.Input -}}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
+			Functions: `<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
+You are a function calling AI model, you can call the following functions:
+## Available Tools
+{{range .Functions}}
+- {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
+{{end}}
+When using a tool, reply with JSON, for instance {"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}}
+<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Input -}}`,
+			ChatMessage: `{{if eq .RoleName "user" -}}
+<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
+{{- else if eq .RoleName "system" -}}
+<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
+{{- else if eq .RoleName "assistant" -}}
+<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
+{{- else if eq .RoleName "tool" -}}
+<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
+{{- else if .FunctionCall -}}
+<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{toJson .FunctionCall}}}<|END_OF_TURN_TOKEN|>
+{{- end -}}`,
+		},
+		StopWords: []string{"<|END_OF_TURN_TOKEN|>"},
+	},
+	Phi3: {
+		TemplateConfig: TemplateConfig{
+			Chat:        "{{.Input}}\n<|assistant|>",
+			ChatMessage: "<|{{ .RoleName }}|>\n{{.Content}}<|end|>",
+			Completion:  "{{.Input}}",
+		},
+		StopWords: []string{"<|end|>", "<|endoftext|>"},
+	},
+	ChatML: {
+		TemplateConfig: TemplateConfig{
+			Chat: "{{.Input -}}\n<|im_start|>assistant",
+			Functions: `<|im_start|>system
+You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
+{{range .Functions}}
+{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
+{{end}}
+For each function call return a json object with function name and arguments
+<|im_end|>
+{{.Input -}}
+<|im_start|>assistant`,
+			ChatMessage: `<|im_start|>{{ .RoleName }}
+{{ if .FunctionCall -}}
+Function call:
+{{ else if eq .RoleName "tool" -}}
+Function response:
+{{ end -}}
+{{ if .Content -}}
+{{.Content }}
+{{ end -}}
+{{ if .FunctionCall -}}
+{{toJson .FunctionCall}}
+{{ end -}}<|im_end|>`,
+		},
+		StopWords: []string{"<|im_end|>", "<dummy32000>", "</s>"},
+	},
+	Mistral03: {
+		TemplateConfig: TemplateConfig{
+			Chat:      "{{.Input -}}",
+			Functions: `[AVAILABLE_TOOLS] [{{range .Functions}}{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}{{end}} ] [/AVAILABLE_TOOLS]{{.Input }}`,
+			ChatMessage: `{{if eq .RoleName "user" -}}
+[INST] {{.Content }} [/INST]
+{{- else if .FunctionCall -}}
+[TOOL_CALLS] {{toJson .FunctionCall}} [/TOOL_CALLS]
+{{- else if eq .RoleName "tool" -}}
+[TOOL_RESULTS] {{.Content}} [/TOOL_RESULTS]
+{{- else -}}
+{{ .Content -}}
+{{ end -}}`,
+		},
+		StopWords: []string{"<|im_end|>", "<dummy32000>", "</tool_call>", "<|eot_id|>", "<|end_of_text|>", "</s>", "[/TOOL_CALLS]", "[/ACTIONS]"},
+	},
+}
+
+// this maps well known template used in HF to model families defined above
+var knownTemplates = map[string]familyType{
+	`{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}`:                              ChatML,
+	`{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}`: Mistral03,
+}
+
+func guessDefaultsFromFile(cfg *BackendConfig, modelPath string) {
+
 	if os.Getenv("LOCALAI_DISABLE_GUESSING") == "true" {
 		log.Debug().Msgf("guessDefaultsFromFile: %s", "guessing disabled with LOCALAI_DISABLE_GUESSING")
 		return
@@ -19,20 +154,106 @@ func guessDefaultsFromFile(cfg *BackendConfig, modelPath string, defaultCtx int)
 		return
 	}

-	// We try to guess only if we don't have a template defined already
-	guessPath := filepath.Join(modelPath, cfg.ModelFileName())
-
-	// try to parse the gguf file
-	f, err := gguf.ParseGGUFFile(guessPath)
-	if err == nil {
-		guessGGUFFromFile(cfg, f, defaultCtx)
+	if cfg.HasTemplate() {
+		// nothing to guess here
+		log.Debug().Any("name", cfg.Name).Msgf("guessDefaultsFromFile: %s", "template already set")
 		return
 	}

-	if cfg.ContextSize == nil {
-		if defaultCtx == 0 {
-			defaultCtx = defaultContextSize
+	// We try to guess only if we don't have a template defined already
+	guessPath := filepath.Join(modelPath, cfg.ModelFileName())
+	f, err := gguf.ParseGGUFFile(guessPath)
+	if err != nil {
+		// Only valid for gguf files
+		log.Debug().Str("filePath", guessPath).Msg("guessDefaultsFromFile: not a GGUF file")
+		return
+	}
+
+	log.Debug().
+		Any("eosTokenID", f.Tokenizer().EOSTokenID).
+		Any("bosTokenID", f.Tokenizer().BOSTokenID).
+		Any("modelName", f.Model().Name).
+		Any("architecture", f.Architecture().Architecture).Msgf("Model file loaded: %s", cfg.ModelFileName())
+
+	// guess the name
+	if cfg.Name == "" {
+		cfg.Name = f.Model().Name
+	}
+
+	family := identifyFamily(f)
+
+	if family == Unknown {
+		log.Debug().Msgf("guessDefaultsFromFile: %s", "family not identified")
+		return
+	}
+
+	// identify template
+	settings, ok := defaultsSettings[family]
+	if ok {
+		cfg.TemplateConfig = settings.TemplateConfig
+		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: guessed template %+v", cfg.TemplateConfig)
+		if len(cfg.StopWords) == 0 {
+			cfg.StopWords = settings.StopWords
 		}
-		cfg.ContextSize = &defaultCtx
+		if cfg.RepeatPenalty == 0.0 {
+			cfg.RepeatPenalty = settings.RepeatPenalty
+		}
+	} else {
+		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: no template found for family")
+	}
+
+	if cfg.HasTemplate() {
+		return
+	}
+
+	// identify from well known templates first, otherwise use the raw jinja template
+	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
+	if found {
+		// try to use the jinja template
+		cfg.TemplateConfig.JinjaTemplate = true
+		cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString()
+	}
+}
+
+func identifyFamily(f *gguf.GGUFFile) familyType {
+
+	// identify from well known templates first
+	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
+	if found && chatTemplate.ValueString() != "" {
+		if family, ok := knownTemplates[chatTemplate.ValueString()]; ok {
+			return family
+		}
+	}
+
+	// otherwise try to identify from the model properties
+	arch := f.Architecture().Architecture
+	eosTokenID := f.Tokenizer().EOSTokenID
+	bosTokenID := f.Tokenizer().BOSTokenID
+
+	isYI := arch == "llama" && bosTokenID == 1 && eosTokenID == 2
+	// WTF! Mistral0.3 and isYi have same bosTokenID and eosTokenID
+
+	llama3 := arch == "llama" && eosTokenID == 128009
+	commandR := arch == "command-r" && eosTokenID == 255001
+	qwen2 := arch == "qwen2"
+	phi3 := arch == "phi-3"
+	gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Model().Name), "gemma")
+	deepseek2 := arch == "deepseek2"
+
+	switch {
+	case deepseek2:
+		return DeepSeek2
+	case gemma:
+		return Gemma
+	case llama3:
+		return LLaMa3
+	case commandR:
+		return CommandR
+	case phi3:
+		return Phi3
+	case qwen2, isYI:
+		return ChatML
+	default:
+		return Unknown
 	}
 }
--- a/core/http/app.go
+++ b/core/http/app.go
@@ -139,28 +139,6 @@ func API(application *application.Application) (*fiber.App, error) {
 		return nil, fmt.Errorf("failed to create key auth config: %w", err)
 	}

-	httpFS := http.FS(embedDirStatic)
-
-	router.Use(favicon.New(favicon.Config{
-		URL:        "/favicon.svg",
-		FileSystem: httpFS,
-		File:       "static/favicon.svg",
-	}))
-
-	router.Use("/static", filesystem.New(filesystem.Config{
-		Root:       httpFS,
-		PathPrefix: "static",
-		Browse:     true,
-	}))
-
-	if application.ApplicationConfig().ImageDir != "" {
-		router.Static("/generated-images", application.ApplicationConfig().ImageDir)
-	}
-
-	if application.ApplicationConfig().AudioDir != "" {
-		router.Static("/generated-audio", application.ApplicationConfig().AudioDir)
-	}
-
 	// Auth is applied to _all_ endpoints. No exceptions. Filtering out endpoints to bypass is the role of the Filter property of the KeyAuth Configuration
 	router.Use(v2keyauth.New(*kaConfig))

@@ -198,6 +176,20 @@ func API(application *application.Application) (*fiber.App, error) {
 	}
 	routes.RegisterJINARoutes(router, requestExtractor, application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())

+	httpFS := http.FS(embedDirStatic)
+
+	router.Use(favicon.New(favicon.Config{
+		URL:        "/favicon.ico",
+		FileSystem: httpFS,
+		File:       "static/favicon.ico",
+	}))
+
+	router.Use("/static", filesystem.New(filesystem.Config{
+		Root:       httpFS,
+		PathPrefix: "static",
+		Browse:     true,
+	}))
+
 	// Define a custom 404 handler
 	// Note: keep this at the bottom!
 	router.Use(notFoundHandler)
--- a/core/http/elements/gallery.go
+++ b/core/http/elements/gallery.go
@@ -122,15 +122,15 @@ func modelModal(m *gallery.GalleryModel) elem.Node {
 			"id":          modalName(m),
 			"tabindex":    "-1",
 			"aria-hidden": "true",
-			"class":       "hidden fixed top-0 right-0 left-0 z-50 justify-center items-center w-full md:inset-0 h-full max-h-full bg-gray-900/50",
+			"class":       "hidden overflow-y-auto overflow-x-hidden fixed top-0 right-0 left-0 z-50 justify-center items-center w-full md:inset-0 h-[calc(100%-1rem)] max-h-full",
 		},
 		elem.Div(
 			attrs.Props{
-				"class": "relative p-4 w-full max-w-2xl h-[90vh] mx-auto mt-[5vh]",
+				"class": "relative p-4 w-full max-w-2xl max-h-full",
 			},
 			elem.Div(
 				attrs.Props{
-					"class": "relative bg-white rounded-lg shadow dark:bg-gray-700 h-full flex flex-col",
+					"class": "relative p-4 w-full max-w-2xl max-h-full bg-white rounded-lg shadow dark:bg-gray-700",
 				},
 				// header
 				elem.Div(
@@ -164,13 +164,14 @@ func modelModal(m *gallery.GalleryModel) elem.Node {
 				// body
 				elem.Div(
 					attrs.Props{
-						"class": "p-4 md:p-5 space-y-4 overflow-y-auto flex-1 min-h-0",
+						"class": "p-4 md:p-5 space-y-4",
 					},
 					elem.Div(
 						attrs.Props{
 							"class": "flex justify-center items-center",
 						},
 						elem.Img(attrs.Props{
+							//	"class": "rounded-t-lg object-fit object-center h-96",
 							"class":   "lazy rounded-t-lg max-h-48 max-w-96 object-cover mt-3 entered loaded",
 							"src":     m.Icon,
 							"loading": "lazy",
@@ -231,6 +232,7 @@ func modelModal(m *gallery.GalleryModel) elem.Node {
 			),
 		),
 	)
+
 }

 func modelDescription(m *gallery.GalleryModel) elem.Node {
--- a/core/http/elements/p2p.go
+++ b/core/http/elements/p2p.go
@@ -2,7 +2,6 @@ package elements

 import (
 	"fmt"
-	"time"

 	"github.com/chasefleming/elem-go"
 	"github.com/chasefleming/elem-go/attrs"
@@ -19,6 +18,19 @@ func renderElements(n []elem.Node) string {
 }

 func P2PNodeStats(nodes []p2p.NodeData) string {
+	/*
+	   <div class="bg-gray-800 p-6 rounded-lg shadow-lg text-left">
+	                       <p class="text-xl font-semibold text-gray-200">Total Workers Detected: {{ len .Nodes }}</p>
+	                       {{ $online := 0 }}
+	                       {{ range .Nodes }}
+	                           {{ if .IsOnline }}
+	                               {{ $online = add $online 1 }}
+	                           {{ end }}
+	                       {{ end }}
+	                       <p class="text-xl font-semibold text-gray-200">Total Online Workers: {{$online}}</p>
+	                   </div>
+	*/
+
 	online := 0
 	for _, n := range nodes {
 		if n.IsOnline() {
@@ -26,21 +38,27 @@ func P2PNodeStats(nodes []p2p.NodeData) string {
 		}
 	}

-	class := "text-blue-400"
+	class := "text-green-500"
 	if online == 0 {
-		class = "text-red-400"
+		class = "text-red-500"
 	}
-
+	/*
+	   <i class="fas fa-circle animate-pulse text-green-500 ml-2 mr-1"></i>
+	*/
+	circle := elem.I(attrs.Props{
+		"class": "fas fa-circle animate-pulse " + class + " ml-2 mr-1",
+	})
 	nodesElements := []elem.Node{
 		elem.Span(
 			attrs.Props{
-				"class": class + " font-bold text-xl",
+				"class": class,
 			},
+			circle,
 			elem.Text(fmt.Sprintf("%d", online)),
 		),
 		elem.Span(
 			attrs.Props{
-				"class": "text-gray-300 text-xl",
+				"class": "text-gray-200",
 			},
 			elem.Text(fmt.Sprintf("/%d", len(nodes))),
 		),
@@ -50,73 +68,77 @@ func P2PNodeStats(nodes []p2p.NodeData) string {
 }

 func P2PNodeBoxes(nodes []p2p.NodeData) string {
+	/*
+			<div class="bg-gray-800 p-4 rounded-lg shadow-lg text-left">
+			<div class="flex items-center mb-2">
+				<i class="fas fa-desktop text-gray-400 mr-2"></i>
+				<span class="text-gray-200 font-semibold">{{.ID}}</span>
+			</div>
+			<p class="text-sm text-gray-400 mt-2 flex items-center">
+				Status:
+				<i class="fas fa-circle {{ if .IsOnline }}text-green-500{{ else }}text-red-500{{ end }} ml-2 mr-1"></i>
+				<span class="{{ if .IsOnline }}text-green-400{{ else }}text-red-400{{ end }}">
+					{{ if .IsOnline }}Online{{ else }}Offline{{ end }}
+				</span>
+			</p>
+		</div>
+	*/
+
 	nodesElements := []elem.Node{}

 	for _, n := range nodes {
-		nodeID := bluemonday.StrictPolicy().Sanitize(n.ID)
-
-		// Define status-specific classes
-		statusIconClass := "text-green-400"
-		statusText := "Online"
-		statusTextClass := "text-green-400"
-
-		if !n.IsOnline() {
-			statusIconClass = "text-red-400"
-			statusText = "Offline"
-			statusTextClass = "text-red-400"
-		}

 		nodesElements = append(nodesElements,
 			elem.Div(
 				attrs.Props{
-					"class": "bg-gray-800/80 border border-gray-700/50 rounded-xl p-4 shadow-lg transition-all duration-300 hover:shadow-blue-900/20 hover:border-blue-700/50",
+					"class": "bg-gray-700 p-6 rounded-lg shadow-lg text-left",
 				},
-				// Node ID and status indicator in top row
-				elem.Div(
+				elem.P(
 					attrs.Props{
-						"class": "flex items-center justify-between mb-3",
+						"class": "text-sm text-gray-400 mt-2 flex",
 					},
-					// Node ID with icon
-					elem.Div(
+					elem.I(
 						attrs.Props{
-							"class": "flex items-center",
+							"class": "fas fa-desktop text-gray-400 mr-2",
 						},
+					),
+					elem.Text("Name: "),
+					elem.Span(
+						attrs.Props{
+							"class": "text-gray-200 font-semibold ml-2 mr-1",
+						},
+						elem.Text(bluemonday.StrictPolicy().Sanitize(n.ID)),
+					),
+					elem.Text("Status: "),
+					elem.If(
+						n.IsOnline(),
 						elem.I(
 							attrs.Props{
-								"class": "fas fa-server text-blue-400 mr-2",
+								"class": "fas fa-circle animate-pulse text-green-500 ml-2 mr-1",
 							},
 						),
+						elem.I(
+							attrs.Props{
+								"class": "fas fa-circle animate-pulse text-red-500 ml-2 mr-1",
+							},
+						),
+					),
+					elem.If(
+						n.IsOnline(),
+						elem.Span(
+							attrs.Props{
+								"class": "text-green-400",
+							},
+
+							elem.Text("Online"),
+						),
 						elem.Span(
 							attrs.Props{
-								"class": "text-white font-medium",
+								"class": "text-red-400",
 							},
-							elem.Text(nodeID),
+							elem.Text("Offline"),
 						),
 					),
-					// Status indicator
-					elem.Div(
-						attrs.Props{
-							"class": "flex items-center",
-						},
-						elem.I(
-							attrs.Props{
-								"class": "fas fa-circle animate-pulse " + statusIconClass + " mr-1.5",
-							},
-						),
-						elem.Span(
-							attrs.Props{
-								"class": statusTextClass,
-							},
-							elem.Text(statusText),
-						),
-					),
-				),
-				// Bottom section with timestamp
-				elem.Div(
-					attrs.Props{
-						"class": "text-xs text-gray-400 pt-1 border-t border-gray-700/30",
-					},
-					elem.Text("Last updated: "+time.Now().UTC().Format("2006-01-02 15:04:05")),
 				),
 			))
 	}
--- a/core/http/endpoints/localai/stores.go
+++ b/core/http/endpoints/localai/stores.go
@@ -21,7 +21,6 @@ func StoresSetEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfi
 		if err != nil {
 			return err
 		}
-		defer sl.Close()

 		vals := make([][]byte, len(input.Values))
 		for i, v := range input.Values {
@@ -49,7 +48,6 @@ func StoresDeleteEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationCo
 		if err != nil {
 			return err
 		}
-		defer sl.Close()

 		if err := store.DeleteCols(c.Context(), sb, input.Keys); err != nil {
 			return err
@@ -71,7 +69,6 @@ func StoresGetEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfi
 		if err != nil {
 			return err
 		}
-		defer sl.Close()

 		keys, vals, err := store.GetCols(c.Context(), sb, input.Keys)
 		if err != nil {
@@ -103,7 +100,6 @@ func StoresFindEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConf
 		if err != nil {
 			return err
 		}
-		defer sl.Close()

 		keys, vals, similarities, err := store.Find(c.Context(), sb, input.Key, input.Topk)
 		if err != nil {
--- a/core/http/endpoints/openai/assistant_test.go
+++ b/core/http/endpoints/openai/assistant_test.go
@@ -40,7 +40,7 @@ func TestAssistantEndpoints(t *testing.T) {
 	cl := &config.BackendConfigLoader{}
 	//configsDir := "/tmp/localai/configs"
 	modelPath := "/tmp/localai/model"
-	var ml = model.NewModelLoader(modelPath, false)
+	var ml = model.NewModelLoader(modelPath)

 	appConfig := &config.ApplicationConfig{
 		ConfigsDir:    configsDir,
--- a/core/http/explorer.go
+++ b/core/http/explorer.go
@@ -29,9 +29,9 @@ func Explorer(db *explorer.Database) *fiber.App {
 	httpFS := http.FS(embedDirStatic)

 	app.Use(favicon.New(favicon.Config{
-		URL:        "/favicon.svg",
+		URL:        "/favicon.ico",
 		FileSystem: httpFS,
-		File:       "static/favicon.svg",
+		File:       "static/favicon.ico",
 	}))

 	app.Use("/static", filesystem.New(filesystem.Config{
--- a/core/http/middleware/request.go
+++ b/core/http/middleware/request.go
@@ -203,10 +203,18 @@ func mergeOpenAIRequestAndBackendConfig(config *config.BackendConfig, input *sch
 		config.Diffusers.ClipSkip = input.ClipSkip
 	}

+	if input.ModelBaseName != "" {
+		config.AutoGPTQ.ModelBaseName = input.ModelBaseName
+	}
+
 	if input.NegativePromptScale != 0 {
 		config.NegativePromptScale = input.NegativePromptScale
 	}

+	if input.UseFastTokenizer {
+		config.UseFastTokenizer = input.UseFastTokenizer
+	}
+
 	if input.NegativePrompt != "" {
 		config.NegativePrompt = input.NegativePrompt
 	}
--- a/core/http/routes/localai.go
+++ b/core/http/routes/localai.go
@@ -50,10 +50,11 @@ func RegisterLocalAIRoutes(router *fiber.App,
 	router.Post("/v1/vad", vadChain...)

 	// Stores
-	router.Post("/stores/set", localai.StoresSetEndpoint(ml, appConfig))
-	router.Post("/stores/delete", localai.StoresDeleteEndpoint(ml, appConfig))
-	router.Post("/stores/get", localai.StoresGetEndpoint(ml, appConfig))
-	router.Post("/stores/find", localai.StoresFindEndpoint(ml, appConfig))
+	sl := model.NewModelLoader("")
+	router.Post("/stores/set", localai.StoresSetEndpoint(sl, appConfig))
+	router.Post("/stores/delete", localai.StoresDeleteEndpoint(sl, appConfig))
+	router.Post("/stores/get", localai.StoresGetEndpoint(sl, appConfig))
+	router.Post("/stores/find", localai.StoresFindEndpoint(sl, appConfig))

 	if !appConfig.DisableMetrics {
 		router.Get("/metrics", localai.LocalAIMetricsEndpoint())
--- a/core/http/routes/openai.go
+++ b/core/http/routes/openai.go
@@ -112,6 +112,14 @@ func RegisterOpenAIRoutes(app *fiber.App,
 		re.SetOpenAIRequest,
 		openai.ImageEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))

+	if application.ApplicationConfig().ImageDir != "" {
+		app.Static("/generated-images", application.ApplicationConfig().ImageDir)
+	}
+
+	if application.ApplicationConfig().AudioDir != "" {
+		app.Static("/generated-audio", application.ApplicationConfig().AudioDir)
+	}
+
 	// List models
 	app.Get("/v1/models", openai.ListModelsEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
 	app.Get("/models", openai.ListModelsEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
--- a/core/http/routes/ui.go
+++ b/core/http/routes/ui.go
@@ -173,6 +173,7 @@ func RegisterUIRoutes(app *fiber.App,
 			}

 			if page != "" {
+				log.Debug().Msgf("page : %+v\n", page)
 				// return a subset of the models
 				pageNum, err := strconv.Atoi(page)
 				if err != nil {
@@ -192,6 +193,7 @@ func RegisterUIRoutes(app *fiber.App,

 				models = models.Paginate(pageNum, itemsNum)

+				log.Debug().Msgf("number of models : %+v\n", len(models))
 				prevPage := pageNum - 1
 				nextPage := pageNum + 1
 				if prevPage < 1 {
@@ -550,7 +552,7 @@ func RegisterUIRoutes(app *fiber.App,
 		title := "LocalAI - Generate audio"

 		for _, b := range backendConfigs {
-			if b.HasUsecases(config.FLAG_TTS) {
+			if b.HasUsecases(config.FLAG_CHAT) {
 				modelThatCanBeUsed = b.Name
 				title = "LocalAI - Generate audio with " + modelThatCanBeUsed
 				break
--- a/core/http/static/chat.js
+++ b/core/http/static/chat.js
@@ -42,6 +42,12 @@ function toggleLoader(show) {
  }
 }

+function submitKey(event) {
+    event.preventDefault();
+    localStorage.setItem("key", document.getElementById("apiKey").value);
+    document.getElementById("apiKey").blur();
+}
+
 function submitSystemPrompt(event) {
  event.preventDefault();
  localStorage.setItem("system_prompt", document.getElementById("systemPrompt").value);
@@ -56,9 +62,10 @@ function submitPrompt(event) {
  const input = document.getElementById("input").value;
  Alpine.store("chat").add("user", input, image);
  document.getElementById("input").value = "";
+  const key = localStorage.getItem("key");
  const systemPrompt = localStorage.getItem("system_prompt");
  Alpine.nextTick(() => { document.getElementById('messages').scrollIntoView(false); });
-  promptGPT(systemPrompt, input);
+  promptGPT(systemPrompt, key, input);
 }

 function readInputImage() {
@@ -75,7 +82,7 @@ function readInputImage() {
 }


-  async function promptGPT(systemPrompt, input) {
+  async function promptGPT(systemPrompt, key, input) {
    const model = document.getElementById("chat-model").value;
    // Set class "loader" to the element with "loader" id
    //document.getElementById("loader").classList.add("loader");
@@ -153,6 +160,7 @@ function readInputImage() {
    const response = await fetch("v1/chat/completions", {
      method: "POST",
      headers: {
+        Authorization: `Bearer ${key}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
@@ -258,12 +266,20 @@ function readInputImage() {
    document.getElementById("input").focus();
  }

+  document.getElementById("key").addEventListener("submit", submitKey);
  document.getElementById("system_prompt").addEventListener("submit", submitSystemPrompt);

  document.getElementById("prompt").addEventListener("submit", submitPrompt);
  document.getElementById("input").focus();
  document.getElementById("input_image").addEventListener("change", readInputImage);

+  const storeKey = localStorage.getItem("key"); // explicit declaration: a bare assignment would create an implicit global
+  if (storeKey) {
+    document.getElementById("apiKey").value = storeKey;
+  } else {
+    document.getElementById("apiKey").value = null;
+  }
+
  storesystemPrompt = localStorage.getItem("system_prompt");
  if (storesystemPrompt) {
    document.getElementById("systemPrompt").value = storesystemPrompt;
--- a/core/http/static/favicon.ico
+++ b/core/http/static/favicon.ico
--- a/core/http/static/favicon.svg
+++ b/core/http/static/favicon.svg
--- a/core/http/static/image.js
+++ b/core/http/static/image.js
@@ -1,11 +1,48 @@
+/*
+
+https://github.com/david-haerer/chatapi
+
+MIT License
+
+Copyright (c) 2023 David Härer
+Copyright (c) 2024 Ettore Di Giacinto
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+function submitKey(event) { // stores the value of #apiKey in localStorage under "key"
+  event.preventDefault();
+  localStorage.setItem("key", document.getElementById("apiKey").value);
+  document.getElementById("apiKey").blur();
+}
+
+
 function genImage(event) {
  event.preventDefault();
  const input = document.getElementById("input").value;
+  const key = localStorage.getItem("key");
+
+  promptDallE(key, input);

-  promptDallE(input);
 }
  
-async function promptDallE(input) {
+async function promptDallE(key, input) {
  document.getElementById("loader").style.display = "block";
  document.getElementById("input").value = "";
  document.getElementById("input").disabled = true;
@@ -14,6 +51,7 @@ async function promptDallE(input) {
  const response = await fetch("v1/images/generations", {
    method: "POST",
    headers: {
+      Authorization: `Bearer ${key}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
@@ -46,6 +84,13 @@ async function promptDallE(input) {
  document.getElementById("input").focus();
 }

+document.getElementById("key").addEventListener("submit", submitKey);
 document.getElementById("input").focus();
 document.getElementById("genimage").addEventListener("submit", genImage);
 document.getElementById("loader").style.display = "none";
+
+const storeKey = localStorage.getItem("key");
+if (storeKey) {
+  document.getElementById("apiKey").value = storeKey;
+}
+
--- a/core/http/static/logo.png
+++ b/core/http/static/logo.png
--- a/core/http/static/logo_horizontal.png
+++ b/core/http/static/logo_horizontal.png
--- a/core/http/static/talk.js
+++ b/core/http/static/talk.js
@@ -9,6 +9,10 @@ let isRecording = false;
 let conversationHistory = [];
 let resetTimer;

+function getApiKey() {
+    return document.getElementById('apiKey').value;
+}
+
 function getModel() {
    return document.getElementById('modelSelect').value;
 }
@@ -95,13 +99,34 @@ function stopRecording() {
    };
 }

+function submitKey(event) {
+    event.preventDefault();
+    localStorage.setItem("key", document.getElementById("apiKey").value);
+    document.getElementById("apiKey").blur();
+}
+
+document.getElementById("key").addEventListener("submit", submitKey);
+
+
+const storeKey = localStorage.getItem("key"); // explicit declaration: a bare assignment would create an implicit global
+if (storeKey) {
+  document.getElementById("apiKey").value = storeKey;
+} else {
+  document.getElementById("apiKey").value = null;
+}
+
+
 async function sendAudioToWhisper(audioBlob) {
    const formData = new FormData();
    formData.append('file', audioBlob);
    formData.append('model', getWhisperModel());
+    API_KEY = localStorage.getItem("key");

    const response = await fetch('v1/audio/transcriptions', {
        method: 'POST',
+        headers: {
+            'Authorization': `Bearer ${API_KEY}`
+        },
        body: formData
    });

@@ -112,10 +137,14 @@ async function sendAudioToWhisper(audioBlob) {

 async function sendTextToChatGPT(text) {
    conversationHistory.push({ role: "user", content: text });
+    API_KEY = localStorage.getItem("key");

    const response = await fetch('v1/chat/completions', {
        method: 'POST',
-        headers: { "Content-Type": "application/json" },
+        headers: {
+            'Authorization': `Bearer ${API_KEY}`,
+            'Content-Type': 'application/json'
+        },
        body: JSON.stringify({
            model: getModel(),
            messages: conversationHistory
@@ -132,10 +161,13 @@ async function sendTextToChatGPT(text) {
 }

 async function getTextToSpeechAudio(text) {
+    API_KEY = localStorage.getItem("key");
+
    const response = await fetch('v1/audio/speech', {
        
        method: 'POST',
        headers: {
+            'Authorization': `Bearer ${API_KEY}`,
            'Content-Type': 'application/json'
        },
        body: JSON.stringify({ 
--- a/core/http/static/tts.js
+++ b/core/http/static/tts.js
@@ -1,204 +1,64 @@
-// Initialize Alpine store for API key management
-document.addEventListener('alpine:init', () => {
-  Alpine.store('chat', {  });
-});
+function submitKey(event) { // stores the value of #apiKey in localStorage under "key"
+  event.preventDefault();
+  localStorage.setItem("key", document.getElementById("apiKey").value);
+  document.getElementById("apiKey").blur();
+}
+

 function genAudio(event) {
  event.preventDefault();
  const input = document.getElementById("input").value;
+  const key = localStorage.getItem("key");

-  if (!input.trim()) {
-    showNotification('error', 'Please enter text to convert to speech');
+  tts(key, input);
+}
+  
+async function tts(key, input) {
+  document.getElementById("loader").style.display = "block";
+  document.getElementById("input").value = "";
+  document.getElementById("input").disabled = true;
+
+  const model = document.getElementById("tts-model").value;
+  const response = await fetch("tts", {
+    method: "POST",
+    headers: {
+      Authorization: `Bearer ${key}`,
+      "Content-Type": "application/json",
+    },
+    body: JSON.stringify({
+      model: model,
+      input: input,
+    }),
+  });
+  if (!response.ok) {
+    document.getElementById("loader").style.display = "none"; // leave the busy state first, so a failed request does not lock the form
+    document.getElementById("input").disabled = false;
+    document.getElementById('result').innerHTML = '<p style="color:red;">Error: ' + (await response.json()).error.message + '</p>';
    return;
  }

-  tts(input);
-}
+  var div = document.getElementById('result');  // Get the div by its ID
+  var link=document.createElement('a');
+  link.className = "m-2 float-right inline-block rounded bg-primary px-6 pb-2.5 mb-3 pt-2.5 text-xs font-medium uppercase leading-normal text-white shadow-primary-3 transition duration-150 ease-in-out hover:bg-primary-accent-300 hover:shadow-primary-2 focus:bg-primary-accent-300 focus:shadow-primary-2 focus:outline-none focus:ring-0 active:bg-primary-600 active:shadow-primary-2 dark:shadow-black/30 dark:hover:shadow-dark-strong dark:focus:shadow-dark-strong dark:active:shadow-dark-strong";
+  link.innerHTML = "<i class='fa-solid fa-download'></i> Download result";
+  const blob = await response.blob();
+  link.href=window.URL.createObjectURL(blob);

-function showNotification(type, message) {
-  // Remove any existing notification
-  const existingNotification = document.getElementById('notification');
-  if (existingNotification) {
-    existingNotification.remove();
-  }
-  
-  // Create new notification
-  const notification = document.createElement('div');
-  notification.id = 'notification';
-  notification.classList.add(
-    'fixed', 'top-24', 'right-4', 'z-50', 'p-4', 'rounded-lg', 'shadow-lg',
-    'transform', 'transition-all', 'duration-300', 'ease-in-out', 'translate-y-0',
-    'flex', 'items-center', 'gap-2'
-  );
-  
-  // Style based on notification type
-  if (type === 'error') {
-    notification.classList.add('bg-red-900/90', 'border', 'border-red-700', 'text-red-200');
-    notification.innerHTML = '<i class="fas fa-circle-exclamation text-red-400 mr-2"></i>' + message;
-  } else if (type === 'warning') {
-    notification.classList.add('bg-yellow-900/90', 'border', 'border-yellow-700', 'text-yellow-200');
-    notification.innerHTML = '<i class="fas fa-triangle-exclamation text-yellow-400 mr-2"></i>' + message;
-  } else if (type === 'success') {
-    notification.classList.add('bg-green-900/90', 'border', 'border-green-700', 'text-green-200');
-    notification.innerHTML = '<i class="fas fa-circle-check text-green-400 mr-2"></i>' + message;
-  } else {
-    notification.classList.add('bg-blue-900/90', 'border', 'border-blue-700', 'text-blue-200');
-    notification.innerHTML = '<i class="fas fa-circle-info text-blue-400 mr-2"></i>' + message;
-  }
-  
-  // Add close button
-  const closeBtn = document.createElement('button');
-  closeBtn.innerHTML = '<i class="fas fa-xmark"></i>';
-  closeBtn.classList.add('ml-auto', 'text-gray-400', 'hover:text-white', 'transition-colors');
-  closeBtn.onclick = () => {
-    notification.classList.add('opacity-0', 'translate-y-[-20px]');
-    setTimeout(() => notification.remove(), 300);
-  };
-  notification.appendChild(closeBtn);
-  
-  // Add to DOM
-  document.body.appendChild(notification);
-  
-  // Animate in
-  setTimeout(() => {
-    notification.classList.add('opacity-0', 'translate-y-[-20px]');
-    notification.offsetHeight; // Force reflow
-    notification.classList.remove('opacity-0', 'translate-y-[-20px]');
-  }, 10);
-  
-  // Auto dismiss after 5 seconds
-  setTimeout(() => {
-    if (document.getElementById('notification')) {
-      notification.classList.add('opacity-0', 'translate-y-[-20px]');
-      setTimeout(() => notification.remove(), 300);
-    }
-  }, 5000);
-}
-
-async function tts(input) {
-  // Show loader and prepare UI
-  const loader = document.getElementById("loader");
-  const inputField = document.getElementById("input");
-  const resultDiv = document.getElementById("result");
-  
-  loader.style.display = "block";
-  inputField.value = "";
-  inputField.disabled = true;
-  resultDiv.innerHTML = '<div class="text-center text-gray-400 italic">Processing your request...</div>';
-
-  // Get the model and make API request
-  const model = document.getElementById("tts-model").value;
-  try {
-    const response = await fetch("tts", {
-      method: "POST",
-      headers: {
-        "Content-Type": "application/json",
-      },
-      body: JSON.stringify({
-        model: model,
-        input: input,
-      }),
-    });
-    
-    if (!response.ok) {
-      const jsonData = await response.json();
-      resultDiv.innerHTML = `
-        <div class="bg-red-900/30 border border-red-700/50 rounded-lg p-4 text-center">
-          <i class="fas fa-circle-exclamation text-red-400 text-2xl mb-2"></i>
-          <p class="text-red-300 font-medium">${jsonData.error.message || 'An error occurred'}</p>
-        </div>
-      `;
-      showNotification('error', 'Failed to generate audio');
-      return;
-    }
-
-    // Handle successful response
-    const blob = await response.blob();
-    const audioUrl = window.URL.createObjectURL(blob);
-    
-    // Create audio player
-    const audioPlayer = document.createElement('div');
-    audioPlayer.className = 'flex flex-col items-center space-y-4 w-full';
-    
-    // Create audio element with styled controls
-    const audio = document.createElement('audio');
-    audio.controls = true;
-    audio.src = audioUrl;
-    audio.className = 'w-full my-4';
-    audioPlayer.appendChild(audio);
-    
-    // Create action buttons container
-    const actionButtons = document.createElement('div');
-    actionButtons.className = 'flex flex-wrap justify-center gap-3';
-    
-    // Download button
-    const downloadLink = document.createElement('a');
-    downloadLink.href = audioUrl;
-    downloadLink.download = `tts-${model}-${new Date().toISOString().slice(0, 10)}.mp3`;
-    downloadLink.className = 'group flex items-center bg-blue-600 hover:bg-blue-700 text-white py-2 px-4 rounded-lg transition duration-300 ease-in-out transform hover:scale-105 hover:shadow-lg';
-    downloadLink.innerHTML = `
-      <i class="fas fa-download mr-2"></i>
-      <span>Download</span>
-      <i class="fas fa-arrow-right opacity-0 group-hover:opacity-100 group-hover:translate-x-2 ml-2 transition-all duration-300"></i>
-    `;
-    actionButtons.appendChild(downloadLink);
-    
-    // Replay button
-    const replayButton = document.createElement('button');
-    replayButton.className = 'group flex items-center bg-purple-600 hover:bg-purple-700 text-white py-2 px-4 rounded-lg transition duration-300 ease-in-out transform hover:scale-105 hover:shadow-lg';
-    replayButton.innerHTML = `
-      <i class="fas fa-rotate-right mr-2"></i>
-      <span>Replay</span>
-    `;
-    replayButton.onclick = () => audio.play();
-    actionButtons.appendChild(replayButton);
-    
-    // Add text display
-    const textDisplay = document.createElement('div');
-    textDisplay.className = 'mt-4 p-4 bg-gray-800/50 border border-gray-700/50 rounded-lg text-gray-300 text-center italic';
-    textDisplay.textContent = `"${input}"`;
-    
-    // Add all elements to result div
-    audioPlayer.appendChild(actionButtons);
-    resultDiv.innerHTML = '';
-    resultDiv.appendChild(audioPlayer);
-    resultDiv.appendChild(textDisplay);
-    
-    // Play audio automatically
-    audio.play();
-    
-    // Show success notification
-    showNotification('success', 'Audio generated successfully');
-    
-  } catch (error) {
-    console.error('Error generating audio:', error);
-    resultDiv.innerHTML = `
-      <div class="bg-red-900/30 border border-red-700/50 rounded-lg p-4 text-center">
-        <i class="fas fa-circle-exclamation text-red-400 text-2xl mb-2"></i>
-        <p class="text-red-300 font-medium">Network error: Failed to connect to the server</p>
-      </div>
-    `;
-    showNotification('error', 'Network error occurred');
-  } finally {
-    // Reset UI state
-    loader.style.display = "none";
-    inputField.disabled = false;
-    inputField.focus();
-  }
-}
-
-// Set up event listeners when DOM is loaded
-document.addEventListener('DOMContentLoaded', () => {
-  document.getElementById("input").focus();
-  document.getElementById("tts").addEventListener("submit", genAudio);
+  div.innerHTML = '';                             // Clear the existing content of the div
+  div.appendChild(link);                           // Add the download link to the div
+  console.log(link)
  document.getElementById("loader").style.display = "none";
- 
-  // Add basic keyboard shortcuts
-  document.addEventListener('keydown', (e) => {
-    // Submit on Ctrl+Enter
-    if (e.key === 'Enter' && e.ctrlKey && document.activeElement.id === 'input') {
-      e.preventDefault();
-      document.getElementById("tts").dispatchEvent(new Event('submit'));
-    }
-  });
-});
+  document.getElementById("input").disabled = false;
+  document.getElementById("input").focus();
+}
+
+document.getElementById("key").addEventListener("submit", submitKey);
+document.getElementById("input").focus();
+document.getElementById("tts").addEventListener("submit", genAudio);
+document.getElementById("loader").style.display = "none";
+
+const storeKey = localStorage.getItem("key");
+if (storeKey) {
+  document.getElementById("apiKey").value = storeKey;
+}
+
--- a/core/http/views/404.html
+++ b/core/http/views/404.html
@@ -1,51 +1,28 @@
 <!DOCTYPE html>
 <html lang="en">
+
 {{template "views/partials/head" .}}

-<body class="bg-gradient-to-br from-gray-900 to-gray-950 text-gray-200">
+<body class="bg-black text-white">
 <div class="flex flex-col min-h-screen">
-
+   
    {{template "views/partials/navbar" .}}
-
-    <div class="container mx-auto px-4 py-8 flex-grow">
-        <!-- Error Section -->
-        <div class="bg-gradient-to-r from-blue-900/30 to-indigo-900/30 rounded-2xl shadow-xl p-8 mb-10">
-            <div class="max-w-4xl mx-auto text-center">
-                <div class="mb-6 text-6xl text-blue-400">
-                    <i class="fas fa-exclamation-circle"></i>
-                </div>
-                <h1 class="text-4xl md:text-5xl font-bold text-white mb-4">
-                    <span class="bg-clip-text text-transparent bg-gradient-to-r from-blue-400 to-indigo-400">
-                        404 - Page Not Found
-                    </span>
-                </h1>
-                <p class="text-xl text-gray-300 mb-6">The page you're looking for doesn't exist or has been moved</p>
-                <div class="flex flex-wrap justify-center gap-4">
-                    <a href="./" 
-                       class="group flex items-center bg-blue-600 hover:bg-blue-700 text-white py-2 px-6 rounded-lg transition duration-300 ease-in-out transform hover:scale-105 hover:shadow-lg">
-                        <i class="fas fa-home mr-2"></i>
-                        <span>Return Home</span>
-                        <i class="fas fa-arrow-right opacity-0 group-hover:opacity-100 group-hover:translate-x-2 ml-2 transition-all duration-300"></i>
-                    </a>
-                    <a href="browse" 
-                       class="group flex items-center bg-indigo-600 hover:bg-indigo-700 text-white py-2 px-6 rounded-lg transition duration-300 ease-in-out transform hover:scale-105 hover:shadow-lg">
-                        <i class="fas fa-images mr-2"></i>
-                        <span>Browse Gallery</span>
-                        <i class="fas fa-arrow-right opacity-0 group-hover:opacity-100 group-hover:translate-x-2 ml-2 transition-all duration-300"></i>
-                    </a>
-                </div>
+    
+    <div class="container mx-auto px-4 flex-grow">
+        <div class="header text-center py-12">
+            <h1 class="text-5xl font-bold">Welcome to your LocalAI instance!</h1>
+            <div class="mt-6">
+         <!--       <a href="./" aria-label="HomePage" alt="HomePage">
+                    <img class="mx-auto w-1/4 h-auto" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd" alt="LocalAI Logo">            
+                </a>
+            -->
            </div>
+            <p class="mt-4 text-lg">The FOSS alternative to OpenAI, Claude, ...</p>
+            <a href="https://localai.io" target="_blank" class="mt-4 inline-block bg-blue-500 text-white py-2 px-4 rounded transition duration-300 ease-in-out hover:bg-blue-700"><i class="fas fa-book-reader pr-2"></i>Documentation</a>
        </div>

-        <!-- Additional Information -->
-        <div class="bg-gray-800/50 border border-gray-700/50 rounded-xl p-8 shadow-md backdrop-blur-sm">
-            <div class="text-center max-w-3xl mx-auto">
-                <div class="inline-flex items-center justify-center w-16 h-16 rounded-full bg-yellow-500/20 mb-4">
-                    <i class="text-yellow-400 text-2xl fa-solid fa-triangle-exclamation"></i>
-                </div>
-                <h2 class="text-2xl md:text-3xl font-semibold text-gray-100 mb-4">Looking for resources?</h2>
-                <p class="text-lg text-gray-300 mb-6">Visit our <a class="text-blue-400 hover:text-blue-300 underline underline-offset-2" href="browse">🖼️ Gallery</a> or check the <a href="https://localai.io/basics/getting_started/" class="text-blue-400 hover:text-blue-300 underline underline-offset-2"> <i class="fa-solid fa-book"></i> Getting started documentation</a></p>
-            </div>
+        <div class="models mt-12">
+            <h2 class="text-center text-3xl font-semibold">Nothing found!</h2>
        </div>
    </div>

@@ -53,4 +30,4 @@
 </div>

 </body>
-</html>
+</html>
--- a/core/http/views/chat.html
+++ b/core/http/views/chat.html
@@ -31,7 +31,7 @@ SOFTWARE.
  <script defer src="static/chat.js"></script>
  {{ $allGalleryConfigs:=.GalleryConfig }}
  {{ $model:=.Model}}
-  <body class="bg-slate-900 text-gray-100 flex flex-col h-screen" x-data="{ sidebarOpen: true }">
+  <body class="bg-slate-900 text-gray-100 flex flex-col h-screen" x-data="{ key: $store.chat.key, sidebarOpen: true }">
    {{template "views/partials/navbar" .}}

    <!-- Main container with sidebar toggle -->
@@ -150,9 +150,36 @@ SOFTWARE.
            </div>

            <!-- Settings tab -->
-            <div x-show="activeTab === 'settings'" x-data="{ showPromptForm: false }" class="space-y-3">           
+            <div x-show="activeTab === 'settings'" x-data="{ showKeyForm: false, showPromptForm: false }" class="space-y-3">
              <button 
-                @click="showPromptForm = !showPromptForm" 
+                @click="showKeyForm = !showKeyForm; showPromptForm = false" 
+                class="w-full flex items-center justify-between px-3 py-2 text-sm rounded text-white bg-gray-700 hover:bg-gray-600 transition-colors"
+              >
+                <span><i class="fa-solid fa-key mr-2"></i> API Key</span>
+                <i :class="showKeyForm ? 'fa-chevron-up' : 'fa-chevron-down'" class="fa-solid"></i>
+              </button>
+              
+              <div x-show="showKeyForm" class="p-3 bg-gray-700 rounded">
+                <form id="key" class="flex flex-col space-y-2">
+                  <input
+                    type="password"
+                    id="apiKey"
+                    name="apiKey"
+                    class="bg-gray-800 text-white border border-gray-600 focus:border-blue-500 focus:ring focus:ring-blue-500 focus:ring-opacity-50 rounded-md shadow-sm p-2 appearance-none"
+                    placeholder="OpenAI API Key"
+                    x-model.lazy="key"
+                  />
+                  <button
+                    type="submit"
+                    class="px-3 py-2 text-sm rounded text-white bg-blue-600 hover:bg-blue-700 transition-colors"
+                  >
+                    Save API Key
+                  </button>
+                </form>
+              </div>
+
+              <button 
+                @click="showPromptForm = !showPromptForm; showKeyForm = false" 
                class="w-full flex items-center justify-between px-3 py-2 text-sm rounded text-white bg-gray-700 hover:bg-gray-600 transition-colors"
              >
                <span><i class="fa-solid fa-message mr-2"></i> System Prompt</span>
--- a/core/http/views/explorer.html
+++ b/core/http/views/explorer.html
@@ -1,224 +1,380 @@
 <!DOCTYPE html>
 <html lang="en">
+
 {{template "views/partials/head" .}}

-<body class="bg-gradient-to-br from-gray-900 to-gray-950 text-gray-200">
-<div class="flex flex-col min-h-screen">
+<style>
+    body {
+        background-color: #1a202c;
+        color: #e2e8f0;
+        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+    }
+    .token {
+        word-break: break-all;
+    }
+    .container {
+        max-width: 800px;
+        margin: 0 auto;
+        padding: 20px;
+        position: relative;
+    }
+    .network-card {
+        background-color: #2d3748;
+        padding: 20px;
+        border-radius: 8px;
+        margin-bottom: 20px;
+        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+        transition: transform 0.3s ease, box-shadow 0.3s ease;
+    }
+    .network-card:hover {
+        transform: translateY(-5px);
+        box-shadow: 0 6px 10px rgba(0, 0, 0, 0.15);
+    }
+    .network-title {
+        font-size: 24px;
+        font-weight: bold;
+        margin-bottom: 10px;
+        color: #63b3ed;
+    }
+    .network-token {
+        font-size: 14px;
+        font-style: italic;
+        color: #cbd5e0;
+        margin-bottom: 10px;
+        word-break: break-word; /* Breaks words to prevent overflow */
+        overflow-wrap: break-word; /* Ensures long strings break */
+        white-space: pre-wrap; /* Preserves whitespace for breaking */
+    }
+    .cluster {
+        margin-top: 10px;
+        background-color: #4a5568;
+        padding: 10px;
+        border-radius: 6px;
+        transition: background-color 0.3s ease;
+    }
+    .cluster:hover {
+        background-color: #5a6b78;
+    }
+    .cluster-title {
+        font-size: 18px;
+        font-weight: bold;
+        color: #e2e8f0;
+    }
+    .form-container {
+        background-color: #2d3748;
+        padding: 20px;
+        border-radius: 8px;
+        margin-bottom: 20px;
+        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+    }
+    .form-control {
+        margin-bottom: 15px;
+    }
+    label {
+        display: block;
+        margin-bottom: 5px;
+        font-weight: bold;
+    }
+    input[type="text"],
+    textarea {
+        width: 100%;
+        padding: 10px;
+        border-radius: 4px;
+        border: 1px solid #4a5568;
+        background-color: #3a4250;
+        color: #e2e8f0;
+        transition: border-color 0.3s ease, background-color 0.3s ease;
+    }
+    input[type="text"]:focus,
+    textarea:focus {
+        border-color: #63b3ed;
+        background-color: #4a5568;
+    }
+    button {
+        background-color: #3182ce;
+        color: #e2e8f0;
+        padding: 10px 20px;
+        border: none;
+        border-radius: 4px;
+        cursor: pointer;
+        transition: background-color 0.3s ease;
+    }
+    .error {
+        color: #e53e3e;
+        margin-top: 5px;
+    }
+    .success {
+        color: #38a169;
+        margin-top: 5px;
+    }
+    /* Spinner Styles */
+    .spinner {
+        display: inline-block;
+        width: 50px;
+        height: 50px;
+        border: 5px solid rgba(255, 255, 255, 0.2);
+        border-radius: 50%;
+        border-top-color: #3182ce;
+        animation: spin 1s linear infinite;
+        margin: 0 auto;
+    }

-    <!-- Simple navigation for login page -->
-    <nav class="bg-gray-900/80 border-b border-gray-800/60 backdrop-blur-sm">
-        <div class="container mx-auto px-4 py-4 flex justify-between items-center">
-            <div class="flex items-center">
-                <i class="fas fa-network-wired text-blue-400 text-2xl mr-3"></i>
-                <h1 class="text-xl font-bold text-white">LocalAI</h1>
+    @keyframes spin {
+        to { transform: rotate(360deg); }
+    }
+
+    /* Center the loading text and spinner */
+    .loading-container {
+        text-align: center;
+        padding: 50px;
+    }
+    .warning-box { /* rounded corners for the disclaimer banner */
+        border-radius: 5px;
+    }
+    .warning-box i { /* spacing after each leading icon */
+        margin-right: 10px;
+    }
+    .token-box { /* clickable panel; relative so .copy-icon can be positioned inside it */
+        background-color: #4a5568;
+        padding: 10px;
+        border-radius: 4px;
+        margin-top: 10px;
+        position: relative;
+        cursor: pointer;
+    }
+    .token-box:hover {
+        background-color: #5a6b7e;
+    }
+    .token-text { /* long tokens wrap instead of overflowing */
+        overflow-wrap: break-word;
+        font-family: monospace;
+    }
+    .copy-icon { /* pinned to the top-right corner of its positioned ancestor */
+        position: absolute;
+        top: 10px;
+        right: 10px;
+        color: #e2e8f0;
+    }
+</style>
+
+<body class="bg-gray-900 text-gray-200">
+    <div class="flex flex-col min-h-screen" x-data="networkClusters()"> <!-- Alpine v3 runs the data object's init() automatically; an extra x-init="init()" would call it twice and start a second 5 s polling timer -->
+        {{template "views/partials/navbar_explorer" .}}
+        <div class="animation-container">
+            <canvas id="networkCanvas"></canvas>
+            <div class="text-overlay">
+                <header class="text-center py-12">
+                    <h1 class="text-5xl font-bold text-gray-100">
+                        <i class="fa-solid fa-circle-nodes mr-2"></i> Network Clusters Explorer   
+        
+                    </h1>
+                    <p class="mt-4 text-lg">
+                        View the clusters and workers available in each network.
+                        <a href="https://localai.io/features/distribute/" target="_blank">
+                            <i class="fas fa-circle-info pr-2"></i>
+                        </a>
+                    </p>
+        
+                </header>
            </div>
        </div>
-    </nav>

-    <div class="container mx-auto px-4 py-8 flex-grow flex items-center justify-center">
-        <!-- Auth Card -->
-        <div class="max-w-md w-full bg-gray-800/90 border border-gray-700/50 rounded-xl overflow-hidden shadow-xl">
-            <div class="animation-container">
-                <div class="text-overlay">
-                    <i class="fas fa-circle-nodes text-5xl text-blue-400 mb-2"></i>
+        <div class="container mx-auto px-4 flex-grow">
+        <!-- Warning Box -->
+        <div class="warning-box bg-yellow-100 text-gray-800 mb-20 pt-5 pb-5 pr-5 pl-5 text-lg">
+            <i class="fa-solid fa-triangle-exclamation"></i><i class="fa-solid fa-flask"></i>
+            The explorer is a global, community-driven tool to share network tokens and view available clusters around the globe.
+            Anyone can use the tokens to offload computation and use the clusters available or share resources.
+            This is provided without any warranty. Use it at your own risk. We are not responsible for any potential harm or misuse. Sharing tokens globally allows anyone from the internet to use your instances. 
+            Although the community will address bugs, this is experimental software and may be insecure to deploy on your hardware unless you take all necessary precautions.
+        </div>
+            <div class="flow-root">
+            <!-- Toggle button for showing/hiding the form -->
+            <button class="bg-red-600 hover:bg-blue-600 float-right mb-2 flex items-center px-4 py-2 rounded" @click="toggleForm()">
+                <!-- Conditional icon display -->
+                <i :class="showForm ? 'fa-solid fa-times' : 'fa-solid fa-plus'" class="mr-2"></i>
+                <span x-text="showForm ? 'Close' : 'Add New Network'"></span>
+            </button>
+        </div>
+            <!-- Form for adding a new network -->
+            <div class="form-container" x-show="showForm" @click.outside="showForm = false">
+                <h2 class="text-3xl font-bold mb-4"><i class="fa-solid fa-plus"></i> Add New Network</h2>
+                <div class="form-control">
+                    <label for="name">Network Name</label>
+                    <input type="text" id="name" x-model="newNetwork.name" placeholder="Enter network name" />
                </div>
+                <div class="form-control">
+                    <label for="description">Description</label>
+                    <textarea id="description" x-model="newNetwork.description" placeholder="Enter description"></textarea>
+                </div>
+                <div class="form-control">
+                    <label for="token">Token</label>
+                    <textarea id="token" x-model="newNetwork.token" placeholder="Enter token"></textarea>
+                </div>
+                <button @click="addNetwork"><i class="fa-solid fa-plus"></i> Add Network</button>
+                <template x-if="errorMessage">
+                    <p class="error" x-text="errorMessage"></p>
+                </template>
+                <template x-if="successMessage">
+                    <p class="success" x-text="successMessage"></p>
+                </template>
            </div>
-            
-            <div class="p-8">
-                <div class="text-center mb-6">
-                    <h2 class="text-2xl font-bold text-white">
-                        <span class="bg-clip-text text-transparent bg-gradient-to-r from-blue-400 to-indigo-400">
-                            Authorization Required
-                        </span>
-                    </h2>
-                    <p class="text-gray-400 mt-2">Please enter your access token to continue</p>
+
+            <!-- Loading Spinner -->
+            <template x-if="networks.length === 0 && !loadingComplete">
+                <div class="loading-container">
+                    <div class="spinner"></div>
+                    <p class="text-center mt-4">Loading networks...</p>
                </div>
-                
-                <form id="login-form" class="space-y-6" onsubmit="login(); return false;">
-                    <div>
-                        <label for="token" class="block text-sm font-medium text-gray-300 mb-2">Access Token</label>
-                        <div class="relative">
-                            <div class="absolute inset-y-0 left-0 pl-3 flex items-center pointer-events-none">
-                                <i class="fas fa-key text-gray-500"></i>
-                            </div>
-                            <input 
-                                type="password" 
-                                id="token" 
-                                name="token" 
-                                placeholder="Enter your token" 
-                                class="bg-gray-700/50 border border-gray-600 text-white placeholder-gray-400 text-sm rounded-lg focus:ring-blue-500 focus:border-blue-500 block w-full pl-10 p-2.5"
-                                required
-                            />
+            </template>
+
+            <template x-if="networks.length === 0 && loadingComplete">
+                <div class="loading-container">
+                    <p class="text-center mt-4">No networks available with online workers</p>
+                </div>
+            </template>
+
+            <!-- Display Networks -->
+            <template x-for="network in networks" :key="network.name">
+                <div class="network-card">
+                    <i class="fa-solid fa-circle-nodes mr-2"></i><span class="network-title font-bold mb-4 mt-1" x-text="network.name"></span>
+                    <div class="token-box" @click="copyToken(network.token)">
+                        <p class="text-lg font-bold mb-4 mt-1">
+                            <i class="fa-solid fa-copy copy-icon"></i>
+                            <i class="fa-solid fa-key mr-2"></i>Token (click to copy): 
+                        </p>
+                        <span class="token-text" x-text="network.token"></span>
+                    </div>
+
+                    <div class="cluster">
+                        <p class="text-lg font-bold mb-4 mt-1"><i class="fa-solid fa-book mr-2"></i> Description</p>
+                        <p x-text="network.description"></p>
+                    </div>
+                    <h2 class="text-3xl font-bold mb-4 mt-4">Available Clusters in this network</h2>
+                    <template x-for="cluster in network.Clusters" :key="cluster.NetworkID + cluster.Type">
+                        <div class="cluster">
+                            <div class="cluster-title"></div>
+                            <span class="inline-block bg-orange-500 text-white py-1 px-3 rounded-full text-xs"  x-text="'Cluster Type: ' + cluster.Type">
+                            </span>
+
+                            <span class="inline-block bg-orange-500 text-white py-1 px-3 rounded-full text-xs" x-show="cluster.NetworkID" x-text="'Network ID: ' + (cluster.NetworkID || 'N/A')">
+                            </span>
+                            <span class="inline-block bg-blue-500 text-white py-1 px-3 rounded-full text-xs"  x-text="'Number of Workers: ' + cluster.Workers.length">
+                            </span>
+                            <!-- Give commands and instructions to join the network -->
+                            <span class="inline-block token-box text-white py-1 px-3 text-xs" x-show="cluster.Type == 'federated'" >
+                                <p class="text-lg font-bold mb-4 mt-1">
+                                    <i class="fa-solid fa-copy copy-icon float-right"></i>
+                                    Command to connect (click to copy): 
+                                </p>
+                                <code class="block bg-gray-700 text-yellow-300 p-4 rounded-lg break-words"  @click="copyToken($el.textContent)" >
+                                    docker run -d --restart=always -e ADDRESS=":80" -e LOCALAI_P2P_NETWORK_ID=<span class="token" x-text="cluster.NetworkID"></span> -e LOCALAI_P2P_LOGLEVEL=debug --name local-ai -e TOKEN="<span class="token" x-text="network.token"></span>" --net host -ti localai/localai:master-ffmpeg-core federated --debug
+                                </code>
+                                or via CLI:
+                                <code class="block bg-gray-700 text-yellow-300 p-4 rounded-lg break-words"  @click="copyToken($el.textContent)" >
+                                   ADDRESS=":80" LOCALAI_P2P_NETWORK_ID=<span class="token" x-text="cluster.NetworkID"></span> LOCALAI_P2P_LOGLEVEL=debug TOKEN="<span class="token" x-text="network.token"></span>" local-ai federated --debug
+                                </code>
+                            </span>
                        </div>
-                    </div>
-                    
-                    <div>
-                        <button 
-                            type="submit" 
-                            class="group w-full flex items-center justify-center bg-gradient-to-r from-blue-600 to-indigo-600 hover:from-blue-700 hover:to-indigo-700 text-white py-3 px-6 rounded-lg transition duration-300 ease-in-out transform hover:scale-[1.02] hover:shadow-lg font-medium"
-                        >
-                            <i class="fas fa-sign-in-alt mr-2"></i>
-                            <span>Login</span>
-                            <i class="fas fa-arrow-right opacity-0 group-hover:opacity-100 group-hover:translate-x-2 ml-2 transition-all duration-300"></i>
-                        </button>
-                    </div>
-                </form>
-                
-                <div class="mt-8 pt-6 border-t border-gray-700/50 text-center text-sm text-gray-400">
-                    <div class="flex items-center justify-center mb-2">
-                        <i class="fas fa-shield-alt mr-2 text-blue-400"></i>
-                        <span>Secure connection</span>
-                    </div>
-                    <p>Current time (UTC): <span id="current-time">{{.CurrentDate}}</span></p>
+                    </template>
                </div>
-            </div>
+            </template>
        </div>
-    </div>
+        <script>
+            function networkClusters() {
+                return {
+                    networks: [],
+                    newNetwork: {
+                        name: '',
+                        description: '',
+                        token: ''
+                    },
+                    errorMessage: '',
+                    successMessage: '',
+                    showForm: false, // Form visibility state
+                    loadingComplete: false, // To track if loading is complete
+                    toggleForm() {
+                        this.showForm = !this.showForm;
+                        console.log('Toggling form:', this.showForm);
+                    },
+                    fetchNetworks() {
+                        console.log('Fetching networks...');
+                        fetch('/networks')
+                            .then(response => response.json())
+                            .then(data => {
+                                console.log('Data fetched successfully:', data);
+                                this.networks = data;
+                                this.loadingComplete = true; // Set loading complete
+                            })
+                            .catch(error => {
+                                console.error('Error fetching networks:', error);
+                                this.loadingComplete = true; // Ensure spinner is hidden if error occurs
+                            });
+                    },

-    {{template "views/partials/footer" .}}
-</div>
+                    addNetwork() {
+                        this.errorMessage = '';
+                        this.successMessage = '';
+                        console.log('Adding new network:', this.newNetwork);

-<script>
-    function login() {
-        const token = document.getElementById('token').value;
-        if (!token.trim()) {
-            // Show error with fading effect
-            const form = document.getElementById('login-form');
-            const errorMsg = document.createElement('div');
-            errorMsg.className = 'p-3 mt-4 bg-red-900/50 text-red-200 rounded-lg border border-red-700/50 text-sm flex items-center';
-            errorMsg.innerHTML = '<i class="fas fa-exclamation-circle mr-2"></i> Please enter a valid token';
-            
-            // Remove any existing error message
-            const existingError = form.querySelector('.bg-red-900/50');
-            if (existingError) form.removeChild(existingError);
-            
-            // Add new error message with animation
-            form.appendChild(errorMsg);
-            setTimeout(() => {
-                errorMsg.style.opacity = '0';
-                errorMsg.style.transition = 'opacity 0.5s ease';
-                setTimeout(() => errorMsg.remove(), 500);
-            }, 3000);
-            return;
-        }
-        
-        var date = new Date();
-        date.setTime(date.getTime() + (24*60*60*1000));
-        document.cookie = `token=${token}; expires=${date.toGMTString()}; path=/`;
-
-        // Show loading state
-        const button = document.querySelector('button[type="submit"]');
-        const originalContent = button.innerHTML;
-        button.disabled = true;
-        button.innerHTML = '<i class="fas fa-spinner fa-spin mr-2"></i> Authenticating...';
-        button.classList.add('bg-gray-600');
-        
-        // Reload after short delay to show loading state
-        setTimeout(() => {
-            window.location.reload();
-        }, 800);
-    }
-    
-    // Update current time
-    function updateCurrentTime() {
-        const timeElement = document.getElementById('current-time');
-        if (timeElement) {
-            const now = new Date();
-            const year = now.getUTCFullYear();
-            const month = String(now.getUTCMonth() + 1).padStart(2, '0');
-            const day = String(now.getUTCDate()).padStart(2, '0');
-            const hours = String(now.getUTCHours()).padStart(2, '0');
-            const minutes = String(now.getUTCMinutes()).padStart(2, '0');
-            const seconds = String(now.getUTCSeconds()).padStart(2, '0');
-            timeElement.textContent = `${year}-${month}-${day} ${hours}:${minutes}:${seconds}`;
-        }
-    }
-    
-    // Initialize current time and update it every second
-    updateCurrentTime();
-    setInterval(updateCurrentTime, 1000);
-    
-    // Add subtle particle animation to the background
-    document.addEventListener('DOMContentLoaded', function() {
-        const animContainer = document.querySelector('.animation-container');
-        if (animContainer) {
-            const canvas = document.createElement('canvas');
-            animContainer.appendChild(canvas);
-            
-            const ctx = canvas.getContext('2d');
-            canvas.width = animContainer.offsetWidth;
-            canvas.height = animContainer.offsetHeight;
-            
-            // Create particles
-            const particles = [];
-            const particleCount = 30;
-            
-            for (let i = 0; i < particleCount; i++) {
-                particles.push({
-                    x: Math.random() * canvas.width,
-                    y: Math.random() * canvas.height,
-                    radius: Math.random() * 3 + 1,
-                    color: `rgba(${Math.random() * 50 + 50}, ${Math.random() * 100 + 100}, ${Math.random() * 155 + 100}, ${Math.random() * 0.4 + 0.1})`,
-                    speedX: Math.random() * 0.5 - 0.25,
-                    speedY: Math.random() * 0.5 - 0.25
-                });
-            }
-            
-            // Animation loop
-            function animate() {
-                requestAnimationFrame(animate);
-                ctx.clearRect(0, 0, canvas.width, canvas.height);
-                
-                particles.forEach(particle => {
-                    particle.x += particle.speedX;
-                    particle.y += particle.speedY;
-                    
-                    // Bounce off edges
-                    if (particle.x < 0 || particle.x > canvas.width) {
-                        particle.speedX = -particle.speedX;
-                    }
-                    
-                    if (particle.y < 0 || particle.y > canvas.height) {
-                        particle.speedY = -particle.speedY;
-                    }
-                    
-                    // Draw particle
-                    ctx.beginPath();
-                    ctx.arc(particle.x, particle.y, particle.radius, 0, Math.PI * 2);
-                    ctx.fillStyle = particle.color;
-                    ctx.fill();
-                });
-                
-                // Connect nearby particles with lines
-                for (let i = 0; i < particles.length; i++) {
-                    for (let j = i + 1; j < particles.length; j++) {
-                        const dx = particles[i].x - particles[j].x;
-                        const dy = particles[i].y - particles[j].y;
-                        const distance = Math.sqrt(dx * dx + dy * dy);
-                        
-                        if (distance < 100) {
-                            ctx.beginPath();
-                            ctx.moveTo(particles[i].x, particles[i].y);
-                            ctx.lineTo(particles[j].x, particles[j].y);
-                            ctx.strokeStyle = `rgba(100, 150, 255, ${0.1 * (1 - distance / 100)})`;
-                            ctx.lineWidth = 1;
-                            ctx.stroke();
+                        // Validate input
+                        if (!this.newNetwork.name || !this.newNetwork.description || !this.newNetwork.token) {
+                            this.errorMessage = 'All fields are required.';
+                            return;
                        }
+
+                        fetch('/network/add', {
+                            method: 'POST',
+                            headers: {
+                                'Content-Type': 'application/json'
+                            },
+                            body: JSON.stringify(this.newNetwork)
+                        })
+                            .then(response => {
+                                if (!response.ok) {
+                                    return response.json().then(err => { throw err; });
+                                }
+                                return response.json();
+                            })
+                            .then(data => {
+                                console.log('Network added successfully:', data);
+                                this.successMessage = 'Network added successfully!';
+                                this.fetchNetworks(); // Refresh the networks list
+                                this.newNetwork = { name: '', description: '', token: '' }; // Clear form
+                            })
+                            .catch(error => {
+                                console.error('Error adding network:', error);
+                                this.errorMessage = 'Failed to add network. Please try again.'
+                                if (error.error) {
+                                    this.errorMessage += " Error : " + error.error;
+                                }
+                            });
+                    },
+                    copyToken(token) {
+                        navigator.clipboard.writeText(token)
+                        .then(() => {
+                            console.log('Text copied to clipboard:', token);
+                            alert('Text copied to clipboard!');
+                        })
+                        .catch(err => {
+                            console.error('Failed to copy token:', err);
+                        });
+                    },
+                    init() {
+                        console.log('Initializing Alpine component...');
+                        this.fetchNetworks();
+                        setInterval(() => {
+                            this.fetchNetworks();
+                        }, 5000); // Refresh every 5 seconds
                    }
                }
            }
-            
-            // Start animation
-            animate();
-            
-            // Resize handling
-            window.addEventListener('resize', () => {
-                canvas.width = animContainer.offsetWidth;
-                canvas.height = animContainer.offsetHeight;
-            });
-        }
-    });
-</script>
+        </script>
+        <script src="static/p2panimation.js"></script>
+
+        {{template "views/partials/footer" .}}
+    </div>

 </body>
-</html>
+
+</html>
--- a/core/http/views/login.html
+++ b/core/http/views/login.html
@@ -1,216 +1,25 @@
 <!DOCTYPE html>
 <html lang="en">
-{{template "views/partials/head" .}}
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Open Authenticated Website</title>
+    <base href="{{.BaseURL}}" />
+    <link rel="icon" type="image/x-icon" href="favicon.ico" />
+</head>
+<body>
+    <h1>Authorization is required</h1>
+    <input type="text" id="token" placeholder="Token" />
+    <button onclick="login()">Login</button>
+    <script>
+        function login() {
+            const token = document.getElementById('token').value;
+            var date = new Date();
+            date.setTime(date.getTime() + (24*60*60*1000));
+            document.cookie = `token=${token}; expires=${date.toGMTString()}`;

-<body class="bg-gradient-to-br from-gray-900 to-gray-950 text-gray-200">
-<div class="flex flex-col min-h-screen">
-
-    {{template "views/partials/navbar" .}}
-
-    <div class="container mx-auto px-4 py-8 flex-grow flex items-center justify-center">
-        <!-- Auth Card -->
-        <div class="max-w-md w-full bg-gray-800/90 border border-gray-700/50 rounded-xl overflow-hidden shadow-xl">
-            <div class="animation-container">
-                <div class="text-overlay">
-                    <img src="static/logo.png" alt="LocalAI Logo" class="h-32">
-                </div>
-            </div>
-            
-            <div class="p-8">
-                <div class="text-center mb-6">
-                    <h2 class="text-2xl font-bold text-white">
-                        <span class="bg-clip-text text-transparent bg-gradient-to-r from-blue-400 to-indigo-400">
-                            Authorization Required
-                        </span>
-                    </h2>
-                    <p class="text-gray-400 mt-2">Please enter your access token to continue</p>
-                </div>
-                
-                <form id="login-form" class="space-y-6" onsubmit="login(); return false;">
-                    <div>
-                        <label for="token" class="block text-sm font-medium text-gray-300 mb-2">Access Token</label>
-                        <div class="relative">
-                            <div class="absolute inset-y-0 left-0 pl-3 flex items-center pointer-events-none">
-                                <i class="fas fa-key text-gray-500"></i>
-                            </div>
-                            <input 
-                                type="password" 
-                                id="token" 
-                                name="token" 
-                                placeholder="Enter your token" 
-                                class="bg-gray-700/50 border border-gray-600 text-white placeholder-gray-400 text-sm rounded-lg focus:ring-blue-500 focus:border-blue-500 block w-full pl-10 p-2.5"
-                                required
-                            />
-                        </div>
-                    </div>
-                    
-                    <div>
-                        <button 
-                            type="submit" 
-                            class="group w-full flex items-center justify-center bg-gradient-to-r from-blue-600 to-indigo-600 hover:from-blue-700 hover:to-indigo-700 text-white py-3 px-6 rounded-lg transition duration-300 ease-in-out transform hover:scale-[1.02] hover:shadow-lg font-medium"
-                        >
-                            <i class="fas fa-sign-in-alt mr-2"></i>
-                            <span>Login</span>
-                            <i class="fas fa-arrow-right opacity-0 group-hover:opacity-100 group-hover:translate-x-2 ml-2 transition-all duration-300"></i>
-                        </button>
-                    </div>
-                </form>
-                
-                <div class="mt-8 pt-6 border-t border-gray-700/50 text-center text-sm text-gray-400">
-                    <div class="flex items-center justify-center mb-2">
-                        <i class="fas fa-shield-alt mr-2 text-blue-400"></i>
-                        <span>Instance is token protected</span>
-                    </div>
-                    <p>Current time (UTC): <span id="current-time">{{.CurrentDate}}</span></p>
-                </div>
-            </div>
-        </div>
-    </div>
-
-    {{template "views/partials/footer" .}}
-</div>
-
-<script>
-    function login() {
-        const token = document.getElementById('token').value;
-        if (!token.trim()) {
-            // Show error with fading effect
-            const form = document.getElementById('login-form');
-            const errorMsg = document.createElement('div');
-            errorMsg.className = 'p-3 mt-4 bg-red-900/50 text-red-200 rounded-lg border border-red-700/50 text-sm flex items-center';
-            errorMsg.innerHTML = '<i class="fas fa-exclamation-circle mr-2"></i> Please enter a valid token';
-            
-            // Remove any existing error message
-            const existingError = form.querySelector('.bg-red-900/50');
-            if (existingError) form.removeChild(existingError);
-            
-            // Add new error message with animation
-            form.appendChild(errorMsg);
-            setTimeout(() => {
-                errorMsg.style.opacity = '0';
-                errorMsg.style.transition = 'opacity 0.5s ease';
-                setTimeout(() => errorMsg.remove(), 500);
-            }, 3000);
-            return;
-        }
-        
-        var date = new Date();
-        date.setTime(date.getTime() + (24*60*60*1000));
-        document.cookie = `token=${token}; expires=${date.toGMTString()}; path=/`;
-
-        // Show loading state
-        const button = document.querySelector('button[type="submit"]');
-        const originalContent = button.innerHTML;
-        button.disabled = true;
-        button.innerHTML = '<i class="fas fa-spinner fa-spin mr-2"></i> Authenticating...';
-        button.classList.add('bg-gray-600');
-        
-        // Reload after short delay to show loading state
-        setTimeout(() => {
            window.location.reload();
-        }, 800);
-    }
-    
-    // Update current time
-    function updateCurrentTime() {
-        const timeElement = document.getElementById('current-time');
-        if (timeElement) {
-            const now = new Date();
-            const year = now.getUTCFullYear();
-            const month = String(now.getUTCMonth() + 1).padStart(2, '0');
-            const day = String(now.getUTCDate()).padStart(2, '0');
-            const hours = String(now.getUTCHours()).padStart(2, '0');
-            const minutes = String(now.getUTCMinutes()).padStart(2, '0');
-            const seconds = String(now.getUTCSeconds()).padStart(2, '0');
-            timeElement.textContent = `${year}-${month}-${day} ${hours}:${minutes}:${seconds}`;
        }
-    }
-    
-    // Initialize current time and update it every second
-    updateCurrentTime();
-    setInterval(updateCurrentTime, 1000);
-    
-    // Add subtle particle animation to the background
-    document.addEventListener('DOMContentLoaded', function() {
-        const animContainer = document.querySelector('.animation-container');
-        if (animContainer) {
-            const canvas = document.createElement('canvas');
-            animContainer.appendChild(canvas);
-            
-            const ctx = canvas.getContext('2d');
-            canvas.width = animContainer.offsetWidth;
-            canvas.height = animContainer.offsetHeight;
-            
-            // Create particles
-            const particles = [];
-            const particleCount = 30;
-            
-            for (let i = 0; i < particleCount; i++) {
-                particles.push({
-                    x: Math.random() * canvas.width,
-                    y: Math.random() * canvas.height,
-                    radius: Math.random() * 3 + 1,
-                    color: `rgba(${Math.random() * 50 + 50}, ${Math.random() * 100 + 100}, ${Math.random() * 155 + 100}, ${Math.random() * 0.4 + 0.1})`,
-                    speedX: Math.random() * 0.5 - 0.25,
-                    speedY: Math.random() * 0.5 - 0.25
-                });
-            }
-            
-            // Animation loop
-            function animate() {
-                requestAnimationFrame(animate);
-                ctx.clearRect(0, 0, canvas.width, canvas.height);
-                
-                particles.forEach(particle => {
-                    particle.x += particle.speedX;
-                    particle.y += particle.speedY;
-                    
-                    // Bounce off edges
-                    if (particle.x < 0 || particle.x > canvas.width) {
-                        particle.speedX = -particle.speedX;
-                    }
-                    
-                    if (particle.y < 0 || particle.y > canvas.height) {
-                        particle.speedY = -particle.speedY;
-                    }
-                    
-                    // Draw particle
-                    ctx.beginPath();
-                    ctx.arc(particle.x, particle.y, particle.radius, 0, Math.PI * 2);
-                    ctx.fillStyle = particle.color;
-                    ctx.fill();
-                });
-                
-                // Connect nearby particles with lines
-                for (let i = 0; i < particles.length; i++) {
-                    for (let j = i + 1; j < particles.length; j++) {
-                        const dx = particles[i].x - particles[j].x;
-                        const dy = particles[i].y - particles[j].y;
-                        const distance = Math.sqrt(dx * dx + dy * dy);
-                        
-                        if (distance < 100) {
-                            ctx.beginPath();
-                            ctx.moveTo(particles[i].x, particles[i].y);
-                            ctx.lineTo(particles[j].x, particles[j].y);
-                            ctx.strokeStyle = `rgba(100, 150, 255, ${0.1 * (1 - distance / 100)})`;
-                            ctx.lineWidth = 1;
-                            ctx.stroke();
-                        }
-                    }
-                }
-            }
-            
-            // Start animation
-            animate();
-            
-            // Resize handling
-            window.addEventListener('resize', () => {
-                canvas.width = animContainer.offsetWidth;
-                canvas.height = animContainer.offsetHeight;
-            });
-        }
-    });
-</script>
-
+    </script>
 </body>
-</html>
+</html>
--- a/core/http/views/models.html
+++ b/core/http/views/models.html
@@ -27,8 +27,6 @@
            </div>
        </div>
        
-        {{template "views/partials/inprogress" .}}
-
        <!-- Search and Filter Section -->
        <div class="bg-gray-800/70 rounded-xl p-6 mb-8 shadow-lg border border-gray-700/50">
            <!-- Search Input -->
--- a/Show More
+++ b/Show More