wire to grpc

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
wip reranking llama.cpp
2026-02-04 03:32:40 -05:00 · 2025-04-19 20:22:31 +02:00 · 2025-04-19 19:52:02 +02:00 · 2025-04-19 15:52:29 +02:00 · 2025-04-19 08:53:24 +02:00 · 2025-04-18 21:45:48 +00:00
121 changed files with 2571 additions and 724 deletions
--- a/.env
+++ b/.env
@@ -29,6 +29,9 @@
 ## Enable/Disable single backend (useful if only one GPU is available)
 # LOCALAI_SINGLE_ACTIVE_BACKEND=true

+# Forces shutdown of the backends if busy (only if LOCALAI_SINGLE_ACTIVE_BACKEND is set)
+# LOCALAI_FORCE_BACKEND_SHUTDOWN=true
+
 ## Specify a build type. Available: cublas, openblas, clblas.
 ## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
 ## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -29,10 +29,6 @@ updates:
    schedule:
      # Check for updates to GitHub Actions every weekday
      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/autogptq"
-    schedule:
-      interval: "weekly"
  - package-ecosystem: "pip"
    directory: "/backend/python/bark"
    schedule:
--- a/.github/workflows/generate_intel_image.yaml
+++ b/.github/workflows/generate_intel_image.yaml
@@ -15,7 +15,7 @@ jobs:
    strategy:
      matrix:
        include:
-          - base-image: intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04
+          - base-image: intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04
            runs-on: 'ubuntu-latest'
            platforms: 'linux/amd64'
    runs-on: ${{matrix.runs-on}}
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -75,6 +75,7 @@ jobs:
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
+            latest-image: 'latest-gpu-hipblas-core'
          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'false'
@@ -251,6 +252,7 @@ jobs:
            image-type: 'core'
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
+            latest-image: 'latest-gpu-intel-f16-core'
          - build-type: 'sycl_f32'
            platforms: 'linux/amd64'
            tag-latest: 'false'
@@ -261,6 +263,7 @@ jobs:
            image-type: 'core'
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
+            latest-image: 'latest-gpu-intel-f32-core'

  core-image-build:
    uses: ./.github/workflows/image_build.yml
@@ -339,6 +342,7 @@ jobs:
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=4 --output-sync=target"
            skip-drivers: 'false'
+            latest-image: 'latest-gpu-nvidia-cuda-12-core'
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
@@ -351,17 +355,18 @@ jobs:
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            makeflags: "--jobs=4 --output-sync=target"
+            latest-image: 'latest-gpu-nvidia-cuda-12-core'
          - build-type: 'vulkan'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-vulkan-ffmpeg-core'
-            latest-image: 'latest-vulkan-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            makeflags: "--jobs=4 --output-sync=target"
+            latest-image: 'latest-gpu-vulkan-core'
  gh-runner:
    uses: ./.github/workflows/image_build.yml
    with:
--- a/.github/workflows/notify-models.yaml
+++ b/.github/workflows/notify-models.yaml
@@ -8,7 +8,7 @@ jobs:
  notify-discord:
    if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
    env:
-        MODEL_NAME: hermes-2-theta-llama-3-8b
+        MODEL_NAME: gemma-3-12b-it
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v4
@@ -16,7 +16,7 @@ jobs:
        fetch-depth: 0 # needed to checkout all branches for this Action to work
    - uses: mudler/localai-github-action@v1
      with:
-        model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
+        model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
        # Check the PR diff using the current branch and the base branch of the PR
    - uses: GrantBirki/git-diff-action@v2.8.0
      id: git-diff-action
@@ -87,7 +87,7 @@ jobs:
  notify-twitter:
    if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
    env:
-        MODEL_NAME: hermes-2-theta-llama-3-8b
+        MODEL_NAME: gemma-3-12b-it
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v4
--- a/.github/workflows/notify-releases.yaml
+++ b/.github/workflows/notify-releases.yaml
@@ -14,7 +14,7 @@ jobs:
    steps:
    - uses: mudler/localai-github-action@v1
      with:
-        model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
+        model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
    - name: Summarize
      id: summarize
      run: |
@@ -60,4 +60,4 @@ jobs:
        DISCORD_AVATAR: "https://avatars.githubusercontent.com/u/139863280?v=4"
      uses: Ilshidur/action-discord@master
      with:
-        args: ${{ steps.summarize.outputs.message }}
+        args: ${{ steps.summarize.outputs.message }}
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -18,7 +18,7 @@ jobs:
        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
        if: ${{ github.actor != 'dependabot[bot]' }}
-        uses: securego/gosec@v2.22.0
+        uses: securego/gosec@v2.22.3
        with:
          # we let the report trigger content trigger a failure using the GitHub Security features.
          args: '-no-fail -fmt sarif -out results.sarif ./...'
--- a/5
+++ b/5
@@ -15,7 +15,7 @@ ARG TARGETARCH
 ARG TARGETVARIANT

 ENV DEBIAN_FRONTEND=noninteractive
-ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
+ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"

 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
@@ -431,9 +431,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMA
 RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/vllm \
    ; fi && \
-    if [[ ( "${EXTRA_BACKENDS}" =~ "autogptq" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
-        make -C backend/python/autogptq \
-    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "bark" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/bark \
    ; fi && \
--- a/30
+++ b/30
@@ -6,7 +6,7 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true

 # llama.cpp versions
-CPPLLAMA_VERSION?=4663bd353c61c1136cd8a97b9908755e4ab30cec
+CPPLLAMA_VERSION?=6408210082cc0a61b992b487be7e2ff2efbb9e36

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
@@ -21,8 +21,8 @@ BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git
 BARKCPP_VERSION?=v1.0.0

 # stablediffusion.cpp (ggml)
-STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=19d876ee300a055629926ff836489901f734f2b7
+STABLEDIFFUSION_GGML_REPO?=https://github.com/richiejp/stable-diffusion.cpp
+STABLEDIFFUSION_GGML_VERSION?=53e3b17eb3d0b5760ced06a1f98320b68b34aaae

 ONNX_VERSION?=1.20.0
 ONNX_ARCH?=x64
@@ -260,11 +260,7 @@ backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp
 	$(MAKE) -C backend/go/image/stablediffusion-ggml libsd.a

 backend-assets/grpc/stablediffusion-ggml: backend/go/image/stablediffusion-ggml/libsd.a backend-assets/grpc
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ LIBRARY_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion-ggml ./backend/go/image/stablediffusion-ggml/
-ifneq ($(UPX),)
-	$(UPX) backend-assets/grpc/stablediffusion-ggml
-endif
+	$(MAKE) -C backend/go/image/stablediffusion-ggml CGO_LDFLAGS="$(CGO_LDFLAGS)" stablediffusion-ggml

 sources/onnxruntime:
 	mkdir -p sources/onnxruntime
@@ -509,18 +505,10 @@ protogen-go-clean:
 	$(RM) bin/*

 .PHONY: protogen-python
-protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
+protogen-python: bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen

 .PHONY: protogen-python-clean
-protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean  exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
-
-.PHONY: autogptq-protogen
-autogptq-protogen:
-	$(MAKE) -C backend/python/autogptq protogen
-
-.PHONY: autogptq-protogen-clean
-autogptq-protogen-clean:
-	$(MAKE) -C backend/python/autogptq protogen-clean
+protogen-python-clean: bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean  exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean

 .PHONY: bark-protogen
 bark-protogen:
@@ -597,7 +585,6 @@ vllm-protogen-clean:
 ## GRPC
 # Note: it is duplicated in the Dockerfile
 prepare-extra-conda-environments: protogen-python
-	$(MAKE) -C backend/python/autogptq
 	$(MAKE) -C backend/python/bark
 	$(MAKE) -C backend/python/coqui
 	$(MAKE) -C backend/python/diffusers
@@ -809,7 +796,8 @@ docker-aio-all:

 docker-image-intel:
 	docker build \
-		--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
+		--progress plain \
+		--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu24.04 \
 		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
 		--build-arg GO_TAGS="none" \
 		--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@@ -817,7 +805,7 @@ docker-image-intel:

 docker-image-intel-xpu:
 	docker build \
-		--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
+		--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04 \
 		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
 		--build-arg GO_TAGS="none" \
 		--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
--- a/README.md
+++ b/README.md
@@ -1,7 +1,6 @@
 <h1 align="center">
  <br>
-  <img height="300" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"> <br>
-    LocalAI
+  <img height="300" src="./core/http/static/logo.png"> <br>
 <br>
 </h1>

@@ -48,9 +47,58 @@

 [![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)

-**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that’s compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
+**LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API that's compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and more) locally or on-prem with consumer-grade hardware, supporting multiple model families. It does not require a GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).

-![screen](https://github.com/mudler/LocalAI/assets/2420543/20b5ccd2-8393-44f0-aaf6-87a23806381e)
+
+## 📚🆕 Local Stack Family
+
+🆕 LocalAI is now part of a comprehensive suite of AI tools designed to work together:
+
+<table>
+  <tr>
+    <td width="50%" valign="top">
+      <a href="https://github.com/mudler/LocalAGI">
+        <img src="https://raw.githubusercontent.com/mudler/LocalAGI/refs/heads/main/webui/react-ui/public/logo_2.png" width="300" alt="LocalAGI Logo">
+      </a>
+    </td>
+    <td width="50%" valign="top">
+      <h3><a href="https://github.com/mudler/LocalAGI">LocalAGI</a></h3>
+      <p>A powerful Local AI agent management platform that serves as a drop-in replacement for OpenAI's Responses API, enhanced with advanced agentic capabilities.</p>
+    </td>
+  </tr>
+  <tr>
+    <td width="50%" valign="top">
+      <a href="https://github.com/mudler/LocalRecall">
+        <img src="https://raw.githubusercontent.com/mudler/LocalRecall/refs/heads/main/static/localrecall_horizontal.png" width="300" alt="LocalRecall Logo">
+      </a>
+    </td>
+    <td width="50%" valign="top">
+      <h3><a href="https://github.com/mudler/LocalRecall">LocalRecall</a></h3>
+      <p>A RESTful API and knowledge base management system that provides persistent memory and storage capabilities for AI agents.</p>
+    </td>
+  </tr>
+</table>
+
+## Screenshots
+
+
+| Talk Interface | Generate Audio |
+| --- | --- |
+| ![Screenshot 2025-03-31 at 12-01-36 LocalAI - Talk](./docs/assets/images/screenshots/screenshot_tts.png) | ![Screenshot 2025-03-31 at 12-01-29 LocalAI - Generate audio with voice-en-us-ryan-low](./docs/assets/images/screenshots/screenshot_tts.png) |
+
+| Models Overview | Generate Images |
+| --- | --- |
+| ![Screenshot 2025-03-31 at 12-01-20 LocalAI - Models](./docs/assets/images/screenshots/screenshot_gallery.png) | ![Screenshot 2025-03-31 at 12-31-41 LocalAI - Generate images with flux 1-dev](./docs/assets/images/screenshots/screenshot_image.png) |
+
+| Chat Interface | Home |
+| --- | --- |
+| ![Screenshot 2025-03-31 at 11-57-44 LocalAI - Chat with localai-functioncall-qwen2 5-7b-v0 5](./docs/assets/images/screenshots/screenshot_chat.png) | ![Screenshot 2025-03-31 at 11-57-23 LocalAI API - c2a39e3 (c2a39e3639227cfd94ffffe9f5691239acc275a8)](./docs/assets/images/screenshots/screenshot_home.png) |
+
+| Login | Swarm |
+| --- | --- |
+|![Screenshot 2025-03-31 at 12-09-59 ](./docs/assets/images/screenshots/screenshot_login.png) | ![Screenshot 2025-03-31 at 12-10-39 LocalAI - P2P dashboard](./docs/assets/images/screenshots/screenshot_p2p.png) |
+
+## 💻 Quickstart

 Run the installer script:

@@ -59,17 +107,21 @@ curl https://localai.io/install.sh | sh
 ```

 Or run with docker:
+
+### CPU only image:
 ```bash
-# CPU only image:
 docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-cpu
-
-# Nvidia GPU:
+```
+### Nvidia GPU:
+```bash
 docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
-
-# CPU and GPU image (bigger size):
+```
+### CPU and GPU image (bigger size):
+```bash
 docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
-
-# AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)
+```
+### AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)
+```bash
 docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
 ```

@@ -88,10 +140,13 @@ local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
 local-ai run oci://localai/phi-2:latest
 ```

-[💻 Getting started](https://localai.io/basics/getting_started/index.html)
+For more information, see [💻 Getting started](https://localai.io/basics/getting_started/index.html)

 ## 📰 Latest project news

+- Apr 2025: [LocalAGI](https://github.com/mudler/LocalAGI) and [LocalRecall](https://github.com/mudler/LocalRecall) join the LocalAI family stack.
+- Apr 2025: WebUI overhaul, AIO images updates
+- Feb 2025: Backend cleanup, Breaking changes, new backends (kokoro, OutelTTS, faster-whisper), Nvidia L4T images
 - Jan 2025: LocalAI model release: https://huggingface.co/mudler/LocalAI-functioncall-phi-4-v0.3, SANA support in diffusers: https://github.com/mudler/LocalAI/pull/4603
 - Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 )
 - Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 )
@@ -105,19 +160,6 @@ local-ai run oci://localai/phi-2:latest

 Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

-## 🔥🔥 Hot topics (looking for help):
-
- Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
- Realtime API https://github.com/mudler/LocalAI/issues/3714
- WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
- Backends v2: https://github.com/mudler/LocalAI/issues/1126
- Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
- Assistant API: https://github.com/mudler/LocalAI/issues/1273
- Vulkan: https://github.com/mudler/LocalAI/issues/1647
- Anthropic API: https://github.com/mudler/LocalAI/issues/1808
-
-If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22
-
 ## 🚀 [Features](https://localai.io/features/)

 - 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `transformers`, `vllm` ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
@@ -131,12 +173,10 @@ If you want to help and contribute, issues up for grabs: https://github.com/mudl
 - 🥽 [Vision API](https://localai.io/features/gpt-vision/)
 - 📈 [Reranker API](https://localai.io/features/reranker/)
 - 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
+- [Agentic capabilities](https://github.com/mudler/LocalAGI)
 - 🔊 Voice activity detection (Silero-VAD support)
 - 🌍 Integrated WebUI!

-## 💻 Usage
-
-Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.

 ### 🔗 Community and integrations

--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -190,11 +190,7 @@ message ModelOptions {
  int32 NGQA = 20;
  string ModelFile = 21;

-  // AutoGPTQ
-  string Device = 22;
-  bool UseTriton = 23;
-  string ModelBaseName = 24;
-  bool UseFastTokenizer = 25;
+

  // Diffusers
  string PipelineType = 26;
--- a/backend/cpp/llama/CMakeLists.txt
+++ b/backend/cpp/llama/CMakeLists.txt
@@ -2,7 +2,7 @@
 ## XXX: In some versions of CMake clip wasn't being built before llama.
 ## This is an hack for now, but it should be fixed in the future.
 set(TARGET myclip)
-add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
+add_library(${TARGET} clip.cpp clip.h clip-impl.h llava.cpp llava.h)
 install(TARGETS ${TARGET} LIBRARY)
 target_include_directories(myclip PUBLIC .)
 target_include_directories(myclip PUBLIC ../..)
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -8,7 +8,7 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 TARGET?=--target grpc-server

 # Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
-CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
+CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF

 # If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)
@@ -36,11 +36,18 @@ else ifeq ($(OS),Darwin)
 endif

 ifeq ($(BUILD_TYPE),sycl_f16)
-	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+	CMAKE_ARGS+=-DGGML_SYCL=ON \
+		-DCMAKE_C_COMPILER=icx \
+		-DCMAKE_CXX_COMPILER=icpx \
+		-DCMAKE_CXX_FLAGS="-fsycl" \
+		-DGGML_SYCL_F16=ON
 endif

 ifeq ($(BUILD_TYPE),sycl_f32)
-	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+	CMAKE_ARGS+=-DGGML_SYCL=ON \
+		-DCMAKE_C_COMPILER=icx \
+		-DCMAKE_CXX_COMPILER=icpx \
+		-DCMAKE_CXX_FLAGS="-fsycl"
 endif

 llama.cpp:
@@ -77,4 +84,4 @@ ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 else
 	+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)
 endif
-	cp llama.cpp/build/bin/grpc-server .
+	cp llama.cpp/build/bin/grpc-server .
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -217,6 +217,7 @@ struct llama_client_slot

    bool infill = false;
    bool embedding = false;
+    bool reranker = false;
    bool has_next_token = true;
    bool truncated = false;
    bool stopped_eos = false;
@@ -509,15 +510,15 @@ struct llama_server_context
    bool load_model(const common_params &params_)
    {
        params = params_;
-        if (!params.mmproj.empty()) {
+        if (!params.mmproj.path.empty()) {
            multimodal = true;
            LOG_INFO("Multi Modal Mode Enabled", {});
-            clp_ctx = clip_init(params.mmproj.c_str(), clip_context_params {
+            clp_ctx = clip_init(params.mmproj.path.c_str(), clip_context_params {
                /* use_gpu */ has_gpu,
-                /*verbosity=*/ 1,
+                /*verbosity=*/ GGML_LOG_LEVEL_INFO,
            });
            if(clp_ctx == nullptr) {
-                LOG_ERR("unable to load clip model: %s", params.mmproj.c_str());
+                LOG_ERR("unable to load clip model: %s", params.mmproj.path.c_str());
                return false;
            }

@@ -531,10 +532,16 @@ struct llama_server_context
        ctx = common_init.context.release();
        if (model == nullptr)
        {
-            LOG_ERR("unable to load model: %s", params.model.c_str());
+            LOG_ERR("unable to load model: %s", params.model.path.c_str());
            return false;
        }

+        // Enable reranking if embeddings are enabled - moved after context initialization
+        if (params.embedding) {
+            params.reranking = true;
+            LOG_INFO("Reranking enabled (embeddings are enabled)", {});
+        }
+
        if (multimodal) {
            const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
            const int n_embd_llm  = llama_model_n_embd(model);
@@ -1413,7 +1420,59 @@ struct llama_server_context
        queue_results.send(res);
    }

-    void request_completion(int task_id, json data, bool infill, bool embedding, int multitask_id)
+    void send_rerank(llama_client_slot &slot, const llama_batch & batch) // Builds a rerank result for `slot` from `batch` and pushes it onto queue_results
+    {
+        task_result res;
+        res.id = slot.task_id;
+        res.multitask_id = slot.multitask_id;
+        res.error = false;
+        res.stop = true; // the result is always marked as final
+
+        float score = -1e6f; // Default score, reported unchanged if no embeddings are read below
+
+        if (!params.reranking) // NOTE(review): this branch only logs; res.error stays false and the default score is still sent
+        {
+            LOG_WARNING("reranking disabled", {
+                {"params.reranking", params.reranking},
+            });
+        }
+        else if (ctx == nullptr) // without a context no embeddings can be read, so the result is flagged as an error
+        {
+            LOG_ERR("context is null, cannot perform reranking");
+            res.error = true;
+        }
+        else
+        {
+            for (int i = 0; i < batch.n_tokens; ++i) {
+                if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { // skip tokens whose logits flag is unset or whose first sequence id is not this slot's id
+                    continue;
+                }
+
+                const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); // try the per-sequence embeddings first
+                if (embd == NULL) {
+                    embd = llama_get_embeddings_ith(ctx, i); // fall back to the embeddings at batch index i
+                }
+
+                if (embd == NULL) {
+                    LOG("failed to get embeddings");
+                    continue;
+                }
+
+                score = embd[0]; // first embedding value is used as the score; a later matching token overwrites it
+            }
+        }
+
+        // Format the result as JSON: the score plus the slot's prompt token count
+        res.result_json = json
+        {
+            {"score", score},
+            {"tokens", slot.num_prompt_tokens}
+        };
+        
+        queue_results.send(res);
+    }
+
+    void request_completion(int task_id, json data, bool infill, bool embedding, bool rerank, int multitask_id)
    {
        task_server task;
        task.id = task_id;
@@ -1421,6 +1480,7 @@ struct llama_server_context
        task.data = std::move(data);
        task.infill_mode = infill;
        task.embedding_mode = embedding;
+        task.reranking_mode = rerank;
        task.type = TASK_TYPE_COMPLETION;
        task.multitask_id = multitask_id;

@@ -1552,7 +1612,7 @@ struct llama_server_context
            subtask_data["prompt"] = subtask_data["prompt"][i];

            // subtasks inherit everything else (infill mode, embedding mode, etc.)
-            request_completion(subtask_ids[i], subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multitask_id);
+            request_completion(subtask_ids[i], subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multiprompt_task.reranking_mode, multitask_id);
        }
    }

@@ -1591,6 +1651,7 @@ struct llama_server_context

                slot->infill       = task.infill_mode;
                slot->embedding    = task.embedding_mode;
+                slot->reranker    = task.reranking_mode;
                slot->task_id      = task.id;
                slot->multitask_id = task.multitask_id;

@@ -2034,6 +2095,14 @@ struct llama_server_context
                    continue;
                }

+                if (slot.reranker)
+                {
+                    send_rerank(slot, batch_view);
+                    slot.release();
+                    slot.i_batch = -1;
+                    continue;
+                }
+
                completion_token_output result;
                const llama_token id = common_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);

@@ -2326,11 +2395,11 @@ static void params_parse(const backend::ModelOptions* request,
   
    // this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809

-    params.model = request->modelfile();
+    params.model.path = request->modelfile();
    if (!request->mmproj().empty()) {
    // get the directory of modelfile
-      std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
-      params.mmproj = model_dir + "/"+ request->mmproj();
+      std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\"));
+      params.mmproj.path = model_dir + "/"+ request->mmproj();
    }
    //  params.model_alias ??
    params.model_alias =  request->modelfile();
@@ -2405,7 +2474,7 @@ static void params_parse(const backend::ModelOptions* request,
        scale_factor = request->lorascale();
     }
     // get the directory of modelfile
-     std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
+     std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\"));
     params.lora_adapters.push_back({ model_dir + "/"+request->loraadapter(), scale_factor });
    }
    params.use_mlock = request->mlock();
@@ -2489,7 +2558,7 @@ public:
        json data = parse_options(true, request, llama);
        const int task_id = llama.queue_tasks.get_new_id();
        llama.queue_results.add_waiting_task_id(task_id);
-        llama.request_completion(task_id, data, false, false, -1);
+        llama.request_completion(task_id, data, false, false, false, -1);
        while (true)
        {
            task_result result = llama.queue_results.recv(task_id);
@@ -2543,7 +2612,7 @@ public:
        json data = parse_options(false, request, llama);
        const int task_id = llama.queue_tasks.get_new_id();
        llama.queue_results.add_waiting_task_id(task_id);
-        llama.request_completion(task_id, data, false, false, -1);
+        llama.request_completion(task_id, data, false, false, false, -1);
        std::string completion_text;
        task_result result = llama.queue_results.recv(task_id);
        if (!result.error && result.stop) {
@@ -2580,7 +2649,7 @@ public:
        json data = parse_options(false, request, llama);
        const int task_id = llama.queue_tasks.get_new_id();
        llama.queue_results.add_waiting_task_id(task_id);
-        llama.request_completion(task_id, { {"prompt", data["embeddings"]}, { "n_predict", 0}, {"image_data", ""} }, false, true, -1);
+        llama.request_completion(task_id, { {"prompt", data["embeddings"]}, { "n_predict", 0}, {"image_data", ""} }, false, true, false, -1);
        // get the result
        task_result result = llama.queue_results.recv(task_id);
        //std::cout << "Embedding result JSON" << result.result_json.dump() << std::endl;
@@ -2612,6 +2681,46 @@ public:
        return grpc::Status::OK;
    }

+    grpc::Status Rerank(ServerContext* context, const backend::RerankRequest* request, backend::RerankResult* rerankResult) { // gRPC handler: queues one rerank task for the request and copies its score into rerankResult
+        // Create a JSON object with the query and documents
+        json data = {
+            {"prompt", request->query()}, // only the query is passed as "prompt"
+            {"documents", request->documents()},
+            {"top_n", request->top_n()}
+        };
+
+        // Generate a new task ID
+        const int task_id = llama.queue_tasks.get_new_id();
+        llama.queue_results.add_waiting_task_id(task_id);
+
+        // Queue the task with reranking mode enabled
+        llama.request_completion(task_id, data, false, false, true, -1); // args: infill=false, embedding=false, rerank=true, multitask_id=-1
+
+        // Get the result
+        task_result result = llama.queue_results.recv(task_id); // NOTE(review): presumably blocks until the result for task_id arrives - confirm
+        llama.queue_results.remove_waiting_task_id(task_id);
+
+        if (!result.error && result.stop) { // NOTE(review): on error nothing is filled in and OK is still returned below
+            // Set usage information
+            backend::Usage* usage = rerankResult->mutable_usage();
+            usage->set_total_tokens(result.result_json.value("tokens", 0));
+            usage->set_prompt_tokens(result.result_json.value("tokens", 0)); // both counters are read from the same "tokens" field
+
+            // Get the score from the result
+            float score = result.result_json.value("score", 0.0f); // falls back to 0.0f when "score" is absent
+
+            // Create document results for each input document
+            for (int i = 0; i < request->documents_size(); i++) {
+                backend::DocumentResult* doc_result = rerankResult->add_results();
+                doc_result->set_index(i);
+                doc_result->set_text(request->documents(i));
+                doc_result->set_relevance_score(score); // NOTE(review): the same single score is assigned to every document
+            }
+        }
+
+        return grpc::Status::OK;
+    }
+
    grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
        llama_client_slot* active_slot = llama.get_active_slot();

@@ -2644,7 +2753,9 @@ void RunServer(const std::string& server_address) {
  ServerBuilder builder;
  builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
  builder.RegisterService(&service);
-
+  builder.SetMaxMessageSize(50 * 1024 * 1024); // 50MB
+  builder.SetMaxSendMessageSize(50 * 1024 * 1024); // 50MB
+  builder.SetMaxReceiveMessageSize(50 * 1024 * 1024); // 50MB
  std::unique_ptr<Server> server(builder.BuildAndStart());
  std::cout << "Server listening on " << server_address << std::endl;
  server->Wait();
--- a/backend/cpp/llama/prepare.sh
+++ b/backend/cpp/llama/prepare.sh
@@ -21,6 +21,7 @@ fi
 ## XXX: In some versions of CMake clip wasn't being built before llama.
 ## This is an hack for now, but it should be fixed in the future.
 cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
+cp -rfv llama.cpp/examples/llava/clip-impl.h llama.cpp/examples/grpc-server/clip-impl.h
 cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
 echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
 cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
--- a/backend/cpp/llama/utils.hpp
+++ b/backend/cpp/llama/utils.hpp
@@ -61,6 +61,7 @@ struct task_server {
    json data;
    bool infill_mode = false;
    bool embedding_mode = false;
+    bool reranking_mode = false;
    int multitask_id = -1;
 };

--- a/backend/go/image/stablediffusion-ggml/Makefile
+++ b/backend/go/image/stablediffusion-ggml/Makefile
@@ -8,6 +8,13 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 # keep standard at C11 and C++11
 CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC

+GOCMD?=go
+CGO_LDFLAGS?=
+# Avoid the parent Makefile overwriting CGO_LDFLAGS, which is needed for hipblas
+CGO_LDFLAGS_SYCL=
+GO_TAGS?=
+LD_FLAGS?=
+
 # Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

@@ -21,7 +28,7 @@ else ifeq ($(BUILD_TYPE),openblas)
 # If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
-# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ 
+# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
 else ifeq ($(BUILD_TYPE),hipblas)
 	CMAKE_ARGS+=-DGGML_HIP=ON
 # If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
@@ -36,16 +43,35 @@ else ifeq ($(OS),Darwin)
 	endif
 endif

-# ifeq ($(BUILD_TYPE),sycl_f16)
-# 	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DSD_SYCL=ON -DGGML_SYCL_F16=ON
-# endif
+ifeq ($(BUILD_TYPE),sycl_f16)
+	CMAKE_ARGS+=-DGGML_SYCL=ON \
+		-DCMAKE_C_COMPILER=icx \
+		-DCMAKE_CXX_COMPILER=icpx \
+		-DSD_SYCL=ON \
+		-DGGML_SYCL_F16=ON
+	CC=icx
+	CXX=icpx
+	CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
+	CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
+	CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
+	CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
+endif

-# ifeq ($(BUILD_TYPE),sycl_f32)
-# 	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON
-# endif
+ifeq ($(BUILD_TYPE),sycl_f32)
+	CMAKE_ARGS+=-DGGML_SYCL=ON \
+		-DCMAKE_C_COMPILER=icx \
+		-DCMAKE_CXX_COMPILER=icpx \
+		-DSD_SYCL=ON
+	CC=icx
+	CXX=icpx
+	CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
+	CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
+	CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
+	CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
+endif

 # warnings
-CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
+# CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function

 # Find all .a archives in ARCHIVE_DIR
 # (ggml can have different backends cpu, cuda, etc., each backend generates a .a archive)
@@ -86,11 +112,24 @@ endif
 	$(MAKE) $(COMBINED_LIB)

 gosd.o:
+ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+	+bash -c "source $(ONEAPI_VARS); \
+	$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c"
+else
 	$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c
+endif

 libsd.a: gosd.o
 	cp $(INCLUDE_PATH)/build/libstable-diffusion.a ./libsd.a
 	$(AR) rcs libsd.a gosd.o

+stablediffusion-ggml:
+	CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_SYCL)" C_INCLUDE_PATH="$(INCLUDE_PATH)" LIBRARY_PATH="$(LIBRARY_PATH)" \
+	CC="$(CC)" CXX="$(CXX)" CGO_CXXFLAGS="$(CGO_CXXFLAGS)" \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o ../../../../backend-assets/grpc/stablediffusion-ggml ./
+ifneq ($(UPX),)
+	$(UPX) ../../../../backend-assets/grpc/stablediffusion-ggml
+endif
+
 clean:
-	rm -rf gosd.o libsd.a build $(COMBINED_LIB)
+	rm -rf gosd.o libsd.a build $(COMBINED_LIB)
--- a/backend/python/autogptq/Makefile
+++ b/backend/python/autogptq/Makefile
@@ -1,17 +0,0 @@
-.PHONY: autogptq
-autogptq: protogen
-	bash install.sh
-
-.PHONY: protogen
-protogen: backend_pb2_grpc.py backend_pb2.py
-
-.PHONY: protogen-clean
-protogen-clean:
-	$(RM) backend_pb2_grpc.py backend_pb2.py
-
-backend_pb2_grpc.py backend_pb2.py:
-	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
-
-.PHONY: clean
-clean: protogen-clean
-	rm -rf venv __pycache__
--- a/backend/python/autogptq/README.md
+++ b/backend/python/autogptq/README.md
@@ -1,5 +0,0 @@
-# Creating a separate environment for the autogptq project
-
-```
-make autogptq
-```
--- a/backend/python/autogptq/backend.py
+++ b/backend/python/autogptq/backend.py
@@ -1,153 +0,0 @@
-#!/usr/bin/env python3
-from concurrent import futures
-import argparse
-import signal
-import sys
-import os
-import time
-import base64
-
-import grpc
-import backend_pb2
-import backend_pb2_grpc
-
-from auto_gptq import AutoGPTQForCausalLM
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from transformers import TextGenerationPipeline
-
-_ONE_DAY_IN_SECONDS = 60 * 60 * 24
-
-# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
-MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
-
-# Implement the BackendServicer class with the service methods
-class BackendServicer(backend_pb2_grpc.BackendServicer):
-    def Health(self, request, context):
-        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
-    def LoadModel(self, request, context):
-        try:
-            device = "cuda:0"
-            if request.Device != "":
-                device = request.Device
-
-            # support loading local model files
-            model_path = os.path.join(os.environ.get('MODELS_PATH', './'), request.Model)
-            tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=request.TrustRemoteCode)
-
-            # support model `Qwen/Qwen-VL-Chat-Int4`
-            if "qwen-vl" in request.Model.lower():
-                self.model_name = "Qwen-VL-Chat"
-                model = AutoModelForCausalLM.from_pretrained(model_path, 
-                    trust_remote_code=request.TrustRemoteCode,
-                    device_map="auto").eval()
-            else:
-                model = AutoGPTQForCausalLM.from_quantized(model_path,
-                    model_basename=request.ModelBaseName,
-                    use_safetensors=True,
-                    trust_remote_code=request.TrustRemoteCode,
-                    device=device,
-                    use_triton=request.UseTriton,
-                    quantize_config=None)
-            
-            self.model = model
-            self.tokenizer = tokenizer
-        except Exception as err:
-            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
-        return backend_pb2.Result(message="Model loaded successfully", success=True)
-
-    def Predict(self, request, context):
-        penalty = 1.0
-        if request.Penalty != 0.0:
-            penalty = request.Penalty
-        tokens = 512
-        if request.Tokens != 0:
-            tokens = request.Tokens
-        top_p = 0.95
-        if request.TopP != 0.0:
-            top_p = request.TopP
-
-        
-        prompt_images = self.recompile_vl_prompt(request)
-        compiled_prompt = prompt_images[0]
-        print(f"Prompt: {compiled_prompt}", file=sys.stderr)
-
-        # Implement Predict RPC
-        pipeline = TextGenerationPipeline(
-            model=self.model, 
-            tokenizer=self.tokenizer,
-            max_new_tokens=tokens,
-            temperature=request.Temperature,
-            top_p=top_p,
-            repetition_penalty=penalty,
-            )
-        t = pipeline(compiled_prompt)[0]["generated_text"]
-        print(f"generated_text: {t}", file=sys.stderr)
-        
-        if compiled_prompt in t:
-            t = t.replace(compiled_prompt, "")
-        # house keeping. Remove the image files from /tmp folder
-        for img_path in prompt_images[1]:
-            try:
-                os.remove(img_path)
-            except Exception as e:
-                print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
-
-        return backend_pb2.Result(message=bytes(t, encoding='utf-8'))
-
-    def PredictStream(self, request, context):
-        # Implement PredictStream RPC
-        #for reply in some_data_generator():
-        #    yield reply
-        # Not implemented yet
-        return self.Predict(request, context)
-
-    def recompile_vl_prompt(self, request):
-        prompt = request.Prompt
-        image_paths = []
-
-        if "qwen-vl" in self.model_name.lower():
-            # request.Images is an array which contains base64 encoded images. Iterate the request.Images array, decode and save each image to /tmp folder with a random filename.
-            # Then, save the image file paths to an array "image_paths".
-            # read "request.Prompt", replace "[img-%d]" with the image file paths in the order they appear in "image_paths". Save the new prompt to "prompt".
-            for i, img in enumerate(request.Images):
-                timestamp = str(int(time.time() * 1000))  # Generate timestamp
-                img_path = f"/tmp/vl-{timestamp}.jpg"  # Use timestamp in filename
-                with open(img_path, "wb") as f:
-                    f.write(base64.b64decode(img))
-                image_paths.append(img_path)
-                prompt = prompt.replace(f"[img-{i}]", "<img>" + img_path + "</img>,")
-        else:
-            prompt = request.Prompt
-        return (prompt, image_paths)
-
-def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
-    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
-    server.add_insecure_port(address)
-    server.start()
-    print("Server started. Listening on: " + address, file=sys.stderr)
-
-    # Define the signal handler function
-    def signal_handler(sig, frame):
-        print("Received termination signal. Shutting down...")
-        server.stop(0)
-        sys.exit(0)
-
-    # Set the signal handlers for SIGINT and SIGTERM
-    signal.signal(signal.SIGINT, signal_handler)
-    signal.signal(signal.SIGTERM, signal_handler)
-
-    try:
-        while True:
-            time.sleep(_ONE_DAY_IN_SECONDS)
-    except KeyboardInterrupt:
-        server.stop(0)
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Run the gRPC server.")
-    parser.add_argument(
-        "--addr", default="localhost:50051", help="The address to bind the server to."
-    )
-    args = parser.parse_args()
-
-    serve(args.addr)
--- a/backend/python/autogptq/install.sh
+++ b/backend/python/autogptq/install.sh
@@ -1,14 +0,0 @@
-#!/bin/bash
-set -e
-
-source $(dirname $0)/../common/libbackend.sh
-
-# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
-# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
-# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
-# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
-if [ "x${BUILD_PROFILE}" == "xintel" ]; then
-    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
-fi
-
-installRequirements
--- a/backend/python/autogptq/requirements-cublas11.txt
+++ b/backend/python/autogptq/requirements-cublas11.txt
@@ -1,2 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/cu118
-torch==2.4.1+cu118
--- a/backend/python/autogptq/requirements-cublas12.txt
+++ b/backend/python/autogptq/requirements-cublas12.txt
@@ -1 +0,0 @@
-torch==2.4.1
--- a/backend/python/autogptq/requirements-hipblas.txt
+++ b/backend/python/autogptq/requirements-hipblas.txt
@@ -1,2 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
-torch==2.4.1+rocm6.0
--- a/backend/python/autogptq/requirements-intel.txt
+++ b/backend/python/autogptq/requirements-intel.txt
@@ -1,6 +0,0 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
-torch==2.3.1+cxx11.abi
-oneccl_bind_pt==2.3.100+xpu
-optimum[openvino]
-setuptools
--- a/backend/python/autogptq/requirements.txt
+++ b/backend/python/autogptq/requirements.txt
@@ -1,6 +0,0 @@
-accelerate
-auto-gptq==0.7.1
-grpcio==1.71.0
-protobuf
-certifi
-transformers
--- a/backend/python/autogptq/run.sh
+++ b/backend/python/autogptq/run.sh
@@ -1,4 +0,0 @@
-#!/bin/bash
-source $(dirname $0)/../common/libbackend.sh
-
-startBackend $@
--- a/backend/python/autogptq/test.sh
+++ b/backend/python/autogptq/test.sh
@@ -1,6 +0,0 @@
-#!/bin/bash
-set -e
-
-source $(dirname $0)/../common/libbackend.sh
-
-runUnittests
--- a/backend/python/bark/backend.py
+++ b/backend/python/bark/backend.py
@@ -61,7 +61,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.Result(success=True)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+        options=[
+            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+        ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/coqui/backend.py
+++ b/backend/python/coqui/backend.py
@@ -86,7 +86,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.Result(success=True)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+        options=[
+            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+        ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/diffusers/backend.py
+++ b/backend/python/diffusers/backend.py
@@ -19,7 +19,7 @@ import grpc

 from diffusers import SanaPipeline, StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, \
    EulerAncestralDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
-from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline
+from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline, Lumina2Text2ImgPipeline
 from diffusers.pipelines.stable_diffusion import safety_checker
 from diffusers.utils import load_image, export_to_video
 from compel import Compel, ReturnedEmbeddingsType
@@ -287,6 +287,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

                    if request.LowVRAM:
                        self.pipe.enable_model_cpu_offload()
+            elif request.PipelineType == "Lumina2Text2ImgPipeline":
+                self.pipe = Lumina2Text2ImgPipeline.from_pretrained(
+                    request.Model,
+                    torch_dtype=torch.bfloat16)
+                if request.LowVRAM:
+                    self.pipe.enable_model_cpu_offload()
            elif request.PipelineType == "SanaPipeline":
                self.pipe = SanaPipeline.from_pretrained(
                    request.Model,
@@ -516,7 +522,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):


 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+        options=[
+            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+        ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/exllama2/backend.py
+++ b/backend/python/exllama2/backend.py
@@ -105,7 +105,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):


 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+        options=[
+            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+        ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/faster-whisper/backend.py
+++ b/backend/python/faster-whisper/backend.py
@@ -62,7 +62,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.TranscriptResult(segments=resultSegments, text=text)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+        options=[
+            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+        ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/kokoro/backend.py
+++ b/backend/python/kokoro/backend.py
@@ -99,7 +99,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.Result(success=True)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+        options=[
+            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+        ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/rerankers/backend.py
+++ b/backend/python/rerankers/backend.py
@@ -91,7 +91,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.RerankResult(usage=usage, results=results)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+        options=[
+            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+        ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/transformers/backend.py
+++ b/backend/python/transformers/backend.py
@@ -559,7 +559,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

 async def serve(address):
    # Start asyncio gRPC server
-    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+        options=[
+            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+        ])
    # Add the servicer to the server
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    # Bind the server to the address
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -320,7 +320,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

 async def serve(address):
    # Start asyncio gRPC server
-    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+        options=[
+            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+        ])
    # Add the servicer to the server
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    # Bind the server to the address
--- a/core/application/application.go
+++ b/core/application/application.go
@@ -16,7 +16,7 @@ type Application struct {
 func newApplication(appConfig *config.ApplicationConfig) *Application {
 	return &Application{
 		backendLoader:      config.NewBackendConfigLoader(appConfig.ModelPath),
-		modelLoader:        model.NewModelLoader(appConfig.ModelPath),
+		modelLoader:        model.NewModelLoader(appConfig.ModelPath, appConfig.SingleBackend),
 		applicationConfig:  appConfig,
 		templatesEvaluator: templates.NewEvaluator(appConfig.ModelPath),
 	}
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -143,7 +143,7 @@ func New(opts ...config.AppOption) (*Application, error) {
 		}()
 	}

-	if options.LoadToMemory != nil {
+	if options.LoadToMemory != nil && !options.SingleBackend {
 		for _, m := range options.LoadToMemory {
 			cfg, err := application.BackendLoader().LoadBackendConfigFileByNameDefaultOptions(m, options)
 			if err != nil {
--- a/core/backend/embeddings.go
+++ b/core/backend/embeddings.go
@@ -17,6 +17,7 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendCo
 	if err != nil {
 		return nil, err
 	}
+	defer loader.Close()

 	var fn func() ([]float32, error)
 	switch model := inferenceModel.(type) {
--- a/core/backend/image.go
+++ b/core/backend/image.go
@@ -16,6 +16,7 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
 	if err != nil {
 		return nil, err
 	}
+	defer loader.Close()

 	fn := func() error {
 		_, err := inferenceModel.GenerateImage(
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -53,6 +53,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 	if err != nil {
 		return nil, err
 	}
+	defer loader.Close()

 	var protoMessages []*proto.Message
 	// if we are using the tokenizer template, we need to convert the messages to proto messages
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -40,10 +40,6 @@ func ModelOptions(c config.BackendConfig, so *config.ApplicationConfig, opts ...
 	grpcOpts := grpcModelOpts(c)
 	defOpts = append(defOpts, model.WithLoadGRPCLoadModelOpts(grpcOpts))

-	if so.SingleBackend {
-		defOpts = append(defOpts, model.WithSingleActiveBackend())
-	}
-
 	if so.ParallelBackendRequests {
 		defOpts = append(defOpts, model.EnableParallelRequests)
 	}
@@ -121,7 +117,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 	triggers := make([]*pb.GrammarTrigger, 0)
 	for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers {
 		triggers = append(triggers, &pb.GrammarTrigger{
-			Word:    t.Word,
+			Word: t.Word,
 		})

 	}
@@ -161,38 +157,33 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		DisableLogStatus:     c.DisableLogStatus,
 		DType:                c.DType,
 		// LimitMMPerPrompt vLLM
-		LimitImagePerPrompt:  int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
-		LimitVideoPerPrompt:  int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
-		LimitAudioPerPrompt:  int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
-		MMProj:               c.MMProj,
-		FlashAttention:       c.FlashAttention,
-		CacheTypeKey:         c.CacheTypeK,
-		CacheTypeValue:       c.CacheTypeV,
-		NoKVOffload:          c.NoKVOffloading,
-		YarnExtFactor:        c.YarnExtFactor,
-		YarnAttnFactor:       c.YarnAttnFactor,
-		YarnBetaFast:         c.YarnBetaFast,
-		YarnBetaSlow:         c.YarnBetaSlow,
-		NGQA:                 c.NGQA,
-		RMSNormEps:           c.RMSNormEps,
-		MLock:                mmlock,
-		RopeFreqBase:         c.RopeFreqBase,
-		RopeScaling:          c.RopeScaling,
-		Type:                 c.ModelType,
-		RopeFreqScale:        c.RopeFreqScale,
-		NUMA:                 c.NUMA,
-		Embeddings:           embeddings,
-		LowVRAM:              lowVRAM,
-		NGPULayers:           int32(nGPULayers),
-		MMap:                 mmap,
-		MainGPU:              c.MainGPU,
-		Threads:              int32(*c.Threads),
-		TensorSplit:          c.TensorSplit,
-		// AutoGPTQ
-		ModelBaseName:    c.AutoGPTQ.ModelBaseName,
-		Device:           c.AutoGPTQ.Device,
-		UseTriton:        c.AutoGPTQ.Triton,
-		UseFastTokenizer: c.AutoGPTQ.UseFastTokenizer,
+		LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
+		LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
+		LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
+		MMProj:              c.MMProj,
+		FlashAttention:      c.FlashAttention,
+		CacheTypeKey:        c.CacheTypeK,
+		CacheTypeValue:      c.CacheTypeV,
+		NoKVOffload:         c.NoKVOffloading,
+		YarnExtFactor:       c.YarnExtFactor,
+		YarnAttnFactor:      c.YarnAttnFactor,
+		YarnBetaFast:        c.YarnBetaFast,
+		YarnBetaSlow:        c.YarnBetaSlow,
+		NGQA:                c.NGQA,
+		RMSNormEps:          c.RMSNormEps,
+		MLock:               mmlock,
+		RopeFreqBase:        c.RopeFreqBase,
+		RopeScaling:         c.RopeScaling,
+		Type:                c.ModelType,
+		RopeFreqScale:       c.RopeFreqScale,
+		NUMA:                c.NUMA,
+		Embeddings:          embeddings,
+		LowVRAM:             lowVRAM,
+		NGPULayers:          int32(nGPULayers),
+		MMap:                mmap,
+		MainGPU:             c.MainGPU,
+		Threads:             int32(*c.Threads),
+		TensorSplit:         c.TensorSplit,
 		// RWKV
 		Tokenizer: c.Tokenizer,
 	}
--- a/core/backend/rerank.go
+++ b/core/backend/rerank.go
@@ -12,10 +12,10 @@ import (
 func Rerank(request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
 	opts := ModelOptions(backendConfig, appConfig)
 	rerankModel, err := loader.Load(opts...)
-
 	if err != nil {
 		return nil, err
 	}
+	defer loader.Close()

 	if rerankModel == nil {
 		return nil, fmt.Errorf("could not load rerank model")
--- a/core/backend/soundgeneration.go
+++ b/core/backend/soundgeneration.go
@@ -26,10 +26,10 @@ func SoundGeneration(

 	opts := ModelOptions(backendConfig, appConfig)
 	soundGenModel, err := loader.Load(opts...)
-
 	if err != nil {
 		return "", nil, err
 	}
+	defer loader.Close()

 	if soundGenModel == nil {
 		return "", nil, fmt.Errorf("could not load sound generation model")
--- a/core/backend/token_metrics.go
+++ b/core/backend/token_metrics.go
@@ -20,6 +20,7 @@ func TokenMetrics(
 	if err != nil {
 		return nil, err
 	}
+	defer loader.Close()

 	if model == nil {
 		return nil, fmt.Errorf("could not loadmodel model")
--- a/core/backend/tokenize.go
+++ b/core/backend/tokenize.go
@@ -14,10 +14,10 @@ func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.Bac

 	opts := ModelOptions(backendConfig, appConfig)
 	inferenceModel, err = loader.Load(opts...)
-
 	if err != nil {
 		return schema.TokenizeResponse{}, err
 	}
+	defer loader.Close()

 	predictOptions := gRPCPredictOpts(backendConfig, loader.ModelPath)
 	predictOptions.Prompt = s
--- a/core/backend/transcript.go
+++ b/core/backend/transcript.go
@@ -24,6 +24,7 @@ func ModelTranscription(audio, language string, translate bool, ml *model.ModelL
 	if err != nil {
 		return nil, err
 	}
+	defer ml.Close()

 	if transcriptionModel == nil {
 		return nil, fmt.Errorf("could not load transcription model")
--- a/core/backend/tts.go
+++ b/core/backend/tts.go
@@ -23,10 +23,10 @@ func ModelTTS(
 ) (string, *proto.Result, error) {
 	opts := ModelOptions(backendConfig, appConfig, model.WithDefaultBackendString(model.PiperBackend))
 	ttsModel, err := loader.Load(opts...)
-
 	if err != nil {
 		return "", nil, err
 	}
+	defer loader.Close()

 	if ttsModel == nil {
 		return "", nil, fmt.Errorf("could not load tts model %q", backendConfig.Model)
--- a/core/backend/vad.go
+++ b/core/backend/vad.go
@@ -19,6 +19,8 @@ func VAD(request *schema.VADRequest,
 	if err != nil {
 		return nil, err
 	}
+	defer ml.Close()
+
 	req := proto.VADRequest{
 		Audio: request.Audio,
 	}
--- a/core/cli/soundgeneration.go
+++ b/core/cli/soundgeneration.go
@@ -74,7 +74,7 @@ func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {
 		AssetsDestination:    t.BackendAssetsPath,
 		ExternalGRPCBackends: externalBackends,
 	}
-	ml := model.NewModelLoader(opts.ModelPath)
+	ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)

 	defer func() {
 		err := ml.StopAllGRPC()
--- a/core/cli/transcript.go
+++ b/core/cli/transcript.go
@@ -32,7 +32,7 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
 	}

 	cl := config.NewBackendConfigLoader(t.ModelsPath)
-	ml := model.NewModelLoader(opts.ModelPath)
+	ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)
 	if err := cl.LoadBackendConfigsFromPath(t.ModelsPath); err != nil {
 		return err
 	}
--- a/core/cli/tts.go
+++ b/core/cli/tts.go
@@ -41,7 +41,7 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
 		AudioDir:          outputDir,
 		AssetsDestination: t.BackendAssetsPath,
 	}
-	ml := model.NewModelLoader(opts.ModelPath)
+	ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)

 	defer func() {
 		err := ml.StopAllGRPC()
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -50,9 +50,6 @@ type BackendConfig struct {
 	// LLM configs (GPT4ALL, Llama.cpp, ...)
 	LLMConfig `yaml:",inline"`

-	// AutoGPTQ specifics
-	AutoGPTQ AutoGPTQ `yaml:"autogptq"`
-
 	// Diffusers
 	Diffusers Diffusers `yaml:"diffusers"`
 	Step      int       `yaml:"step"`
@@ -176,14 +173,6 @@ type LimitMMPerPrompt struct {
 	LimitAudioPerPrompt int `yaml:"audio"`
 }

-// AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
-type AutoGPTQ struct {
-	ModelBaseName    string `yaml:"model_base_name"`
-	Device           string `yaml:"device"`
-	Triton           bool   `yaml:"triton"`
-	UseFastTokenizer bool   `yaml:"use_fast_tokenizer"`
-}
-
 // TemplateConfig is a struct that holds the configuration of the templating system
 type TemplateConfig struct {
 	// Chat is the template used in the chat completion endpoint
@@ -555,7 +544,7 @@ func (c *BackendConfig) GuessUsecases(u BackendConfigUsecases) bool {
 		}
 	}
 	if (u & FLAG_TTS) == FLAG_TTS {
-		ttsBackends := []string{"piper", "transformers-musicgen", "parler-tts"}
+		ttsBackends := []string{"bark-cpp", "parler-tts", "piper", "transformers-musicgen"}
 		if !slices.Contains(ttsBackends, c.Backend) {
 			return false
 		}
--- a/core/http/app.go
+++ b/core/http/app.go
@@ -142,9 +142,9 @@ func API(application *application.Application) (*fiber.App, error) {
 	httpFS := http.FS(embedDirStatic)

 	router.Use(favicon.New(favicon.Config{
-		URL:        "/favicon.ico",
+		URL:        "/favicon.svg",
 		FileSystem: httpFS,
-		File:       "static/favicon.ico",
+		File:       "static/favicon.svg",
 	}))

 	router.Use("/static", filesystem.New(filesystem.Config{
--- a/core/http/elements/gallery.go
+++ b/core/http/elements/gallery.go
@@ -122,15 +122,15 @@ func modelModal(m *gallery.GalleryModel) elem.Node {
 			"id":          modalName(m),
 			"tabindex":    "-1",
 			"aria-hidden": "true",
-			"class":       "hidden overflow-y-auto overflow-x-hidden fixed top-0 right-0 left-0 z-50 justify-center items-center w-full md:inset-0 h-[calc(100%-1rem)] max-h-full",
+			"class":       "hidden fixed top-0 right-0 left-0 z-50 justify-center items-center w-full md:inset-0 h-full max-h-full bg-gray-900/50",
 		},
 		elem.Div(
 			attrs.Props{
-				"class": "relative p-4 w-full max-w-2xl max-h-full",
+				"class": "relative p-4 w-full max-w-2xl h-[90vh] mx-auto mt-[5vh]",
 			},
 			elem.Div(
 				attrs.Props{
-					"class": "relative p-4 w-full max-w-2xl max-h-full bg-white rounded-lg shadow dark:bg-gray-700",
+					"class": "relative bg-white rounded-lg shadow dark:bg-gray-700 h-full flex flex-col",
 				},
 				// header
 				elem.Div(
@@ -164,14 +164,13 @@ func modelModal(m *gallery.GalleryModel) elem.Node {
 				// body
 				elem.Div(
 					attrs.Props{
-						"class": "p-4 md:p-5 space-y-4",
+						"class": "p-4 md:p-5 space-y-4 overflow-y-auto flex-1 min-h-0",
 					},
 					elem.Div(
 						attrs.Props{
 							"class": "flex justify-center items-center",
 						},
 						elem.Img(attrs.Props{
-							//	"class": "rounded-t-lg object-fit object-center h-96",
 							"class":   "lazy rounded-t-lg max-h-48 max-w-96 object-cover mt-3 entered loaded",
 							"src":     m.Icon,
 							"loading": "lazy",
@@ -232,7 +231,6 @@ func modelModal(m *gallery.GalleryModel) elem.Node {
 			),
 		),
 	)
-
 }

 func modelDescription(m *gallery.GalleryModel) elem.Node {
--- a/core/http/endpoints/localai/stores.go
+++ b/core/http/endpoints/localai/stores.go
@@ -21,6 +21,7 @@ func StoresSetEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfi
 		if err != nil {
 			return err
 		}
+		defer sl.Close()

 		vals := make([][]byte, len(input.Values))
 		for i, v := range input.Values {
@@ -48,6 +49,7 @@ func StoresDeleteEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationCo
 		if err != nil {
 			return err
 		}
+		defer sl.Close()

 		if err := store.DeleteCols(c.Context(), sb, input.Keys); err != nil {
 			return err
@@ -69,6 +71,7 @@ func StoresGetEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfi
 		if err != nil {
 			return err
 		}
+		defer sl.Close()

 		keys, vals, err := store.GetCols(c.Context(), sb, input.Keys)
 		if err != nil {
@@ -100,6 +103,7 @@ func StoresFindEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConf
 		if err != nil {
 			return err
 		}
+		defer sl.Close()

 		keys, vals, similarities, err := store.Find(c.Context(), sb, input.Key, input.Topk)
 		if err != nil {
--- a/core/http/endpoints/openai/assistant_test.go
+++ b/core/http/endpoints/openai/assistant_test.go
@@ -40,7 +40,7 @@ func TestAssistantEndpoints(t *testing.T) {
 	cl := &config.BackendConfigLoader{}
 	//configsDir := "/tmp/localai/configs"
 	modelPath := "/tmp/localai/model"
-	var ml = model.NewModelLoader(modelPath)
+	var ml = model.NewModelLoader(modelPath, false)

 	appConfig := &config.ApplicationConfig{
 		ConfigsDir:    configsDir,
--- a/core/http/explorer.go
+++ b/core/http/explorer.go
@@ -29,9 +29,9 @@ func Explorer(db *explorer.Database) *fiber.App {
 	httpFS := http.FS(embedDirStatic)

 	app.Use(favicon.New(favicon.Config{
-		URL:        "/favicon.ico",
+		URL:        "/favicon.svg",
 		FileSystem: httpFS,
-		File:       "static/favicon.ico",
+		File:       "static/favicon.svg",
 	}))

 	app.Use("/static", filesystem.New(filesystem.Config{
--- a/core/http/middleware/request.go
+++ b/core/http/middleware/request.go
@@ -203,18 +203,10 @@ func mergeOpenAIRequestAndBackendConfig(config *config.BackendConfig, input *sch
 		config.Diffusers.ClipSkip = input.ClipSkip
 	}

-	if input.ModelBaseName != "" {
-		config.AutoGPTQ.ModelBaseName = input.ModelBaseName
-	}
-
 	if input.NegativePromptScale != 0 {
 		config.NegativePromptScale = input.NegativePromptScale
 	}

-	if input.UseFastTokenizer {
-		config.UseFastTokenizer = input.UseFastTokenizer
-	}
-
 	if input.NegativePrompt != "" {
 		config.NegativePrompt = input.NegativePrompt
 	}
--- a/core/http/routes/localai.go
+++ b/core/http/routes/localai.go
@@ -50,11 +50,10 @@ func RegisterLocalAIRoutes(router *fiber.App,
 	router.Post("/v1/vad", vadChain...)

 	// Stores
-	sl := model.NewModelLoader("")
-	router.Post("/stores/set", localai.StoresSetEndpoint(sl, appConfig))
-	router.Post("/stores/delete", localai.StoresDeleteEndpoint(sl, appConfig))
-	router.Post("/stores/get", localai.StoresGetEndpoint(sl, appConfig))
-	router.Post("/stores/find", localai.StoresFindEndpoint(sl, appConfig))
+	router.Post("/stores/set", localai.StoresSetEndpoint(ml, appConfig))
+	router.Post("/stores/delete", localai.StoresDeleteEndpoint(ml, appConfig))
+	router.Post("/stores/get", localai.StoresGetEndpoint(ml, appConfig))
+	router.Post("/stores/find", localai.StoresFindEndpoint(ml, appConfig))

 	if !appConfig.DisableMetrics {
 		router.Get("/metrics", localai.LocalAIMetricsEndpoint())
--- a/core/http/static/favicon.ico
+++ b/core/http/static/favicon.ico
--- a/core/http/static/favicon.svg
+++ b/core/http/static/favicon.svg
--- a/core/http/static/logo.png
+++ b/core/http/static/logo.png
--- a/core/http/static/logo_horizontal.png
+++ b/core/http/static/logo_horizontal.png
--- a/core/http/static/talk.js
+++ b/core/http/static/talk.js
@@ -115,6 +115,7 @@ async function sendTextToChatGPT(text) {

    const response = await fetch('v1/chat/completions', {
        method: 'POST',
+        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({
            model: getModel(),
            messages: conversationHistory
--- a/core/http/views/login.html
+++ b/core/http/views/login.html
@@ -12,7 +12,7 @@
        <div class="max-w-md w-full bg-gray-800/90 border border-gray-700/50 rounded-xl overflow-hidden shadow-xl">
            <div class="animation-container">
                <div class="text-overlay">
-                <!--    <i class="fas fa-circle-nodes text-5xl text-blue-400 mb-2"></i> -->
+                    <img src="static/logo.png" alt="LocalAI Logo" class="h-32">
                </div>
            </div>
            
--- a/core/http/views/partials/head.html
+++ b/core/http/views/partials/head.html
@@ -3,7 +3,7 @@
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>{{.Title}}</title>
  <base href="{{.BaseURL}}" />
-  <link rel="icon" type="image/x-icon" href="favicon.ico" />
+  <link rel="shortcut icon" href="static/favicon.svg" type="image/svg+xml">
  <link rel="stylesheet" href="static/assets/highlightjs.css" />
  <script defer src="static/assets/highlightjs.js"></script>
  <script defer src="static/assets/alpine.js"></script>
--- a/core/http/views/partials/navbar.html
+++ b/core/http/views/partials/navbar.html
@@ -4,10 +4,9 @@
            <div class="flex items-center">
                <!-- Logo Image -->
                <a href="./" class="flex items-center group">
-                    <img src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd" 
+                    <img src="static/logo_horizontal.png" 
                         alt="LocalAI Logo" 
-                         class="h-10 mr-3 rounded-lg border border-blue-600/30 shadow-md transition-all duration-300 group-hover:shadow-blue-500/20 group-hover:border-blue-500/50">
-                    <span class="text-white text-xl font-bold bg-clip-text text-transparent bg-gradient-to-r from-blue-400 to-indigo-400">LocalAI</span>
+                         class="h-14 mr-3 brightness-110 transition-all duration-300 group-hover:brightness-125">
                </a>
            </div>
            
--- a/core/http/views/partials/navbar_explorer.html
+++ b/core/http/views/partials/navbar_explorer.html
@@ -4,10 +4,9 @@
            <div class="flex items-center">
                <!-- Logo Image -->
                <a href="./" class="flex items-center group">
-                    <img src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd" 
+                    <img src="static/logo_horizontal.png" 
                         alt="LocalAI Logo" 
                         class="h-10 mr-3 rounded-lg border border-blue-600/30 shadow-md transition-all duration-300 group-hover:shadow-blue-500/20 group-hover:border-blue-500/50">
-                    <span class="text-white text-xl font-bold bg-clip-text text-transparent bg-gradient-to-r from-blue-400 to-indigo-400">LocalAI</span>
                </a>
            </div>
            
--- a/core/schema/openai.go
+++ b/core/schema/openai.go
@@ -202,7 +202,6 @@ type OpenAIRequest struct {

 	Backend string `json:"backend" yaml:"backend"`

-	// AutoGPTQ
 	ModelBaseName string `json:"model_base_name" yaml:"model_base_name"`
 }

--- a/core/schema/prediction.go
+++ b/core/schema/prediction.go
@@ -41,8 +41,6 @@ type PredictionOptions struct {
 	RopeFreqBase        float32 `json:"rope_freq_base" yaml:"rope_freq_base"`
 	RopeFreqScale       float32 `json:"rope_freq_scale" yaml:"rope_freq_scale"`
 	NegativePromptScale float32 `json:"negative_prompt_scale" yaml:"negative_prompt_scale"`
-	// AutoGPTQ
-	UseFastTokenizer bool `json:"use_fast_tokenizer" yaml:"use_fast_tokenizer"`

 	// Diffusers
 	ClipSkip int `json:"clip_skip" yaml:"clip_skip"`
--- a/docs/assets/images/imagen.png
+++ b/docs/assets/images/imagen.png
--- a/docs/assets/images/localai_screenshot.png
+++ b/docs/assets/images/localai_screenshot.png
--- a/docs/assets/images/logos/logo.png
+++ b/docs/assets/images/logos/logo.png
--- a/docs/assets/images/logos/logo.svg
+++ b/docs/assets/images/logos/logo.svg
--- a/docs/assets/images/screenshots/screenshot_chat.png
+++ b/docs/assets/images/screenshots/screenshot_chat.png
--- a/docs/assets/images/screenshots/screenshot_gallery.png
+++ b/docs/assets/images/screenshots/screenshot_gallery.png
--- a/docs/assets/images/screenshots/screenshot_home.png
+++ b/docs/assets/images/screenshots/screenshot_home.png
--- a/docs/assets/images/screenshots/screenshot_image.png
+++ b/docs/assets/images/screenshots/screenshot_image.png
--- a/docs/assets/images/screenshots/screenshot_login.png
+++ b/docs/assets/images/screenshots/screenshot_login.png
--- a/docs/assets/images/screenshots/screenshot_p2p.png
+++ b/docs/assets/images/screenshots/screenshot_p2p.png
--- a/docs/assets/images/screenshots/screenshot_talk.png
+++ b/docs/assets/images/screenshots/screenshot_talk.png
--- a/docs/assets/images/screenshots/screenshot_tts.png
+++ b/docs/assets/images/screenshots/screenshot_tts.png
--- a/docs/assets/jsconfig.json
+++ b/docs/assets/jsconfig.json
@@ -3,7 +3,7 @@
  "baseUrl": ".",
  "paths": {
   "*": [
-    "../../../../.cache/hugo_cache/modules/filecache/modules/pkg/mod/github.com/gohugoio/hugo-mod-jslibs-dist/popperjs/v2@v2.21100.20000/package/dist/cjs/popper.js/*",
+    "../../../../.cache/hugo_cache/modules/filecache/modules/pkg/mod/github.com/gohugoio/hugo-mod-jslibs-dist/popperjs/v2@v2.21100.20000/package/dist/cjs/*",
    "../../../../.cache/hugo_cache/modules/filecache/modules/pkg/mod/github.com/twbs/bootstrap@v5.3.2+incompatible/js/*"
   ]
  }
--- a/docs/config.toml
+++ b/docs/config.toml
@@ -48,9 +48,9 @@ defaultContentLanguage = 'en'

    [params.docs] # Parameters for the /docs 'template'

-        logo = "https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"
-        logo_text = "LocalAI"
-        title           = "LocalAI documentation"           # default html title for documentation pages/sections
+        logo = "https://raw.githubusercontent.com/mudler/LocalAI/refs/heads/master/core/http/static/logo.png"
+        logo_text = ""
+        title           = "LocalAI"           # default html title for documentation pages/sections

        pathName        = "docs"                            # path name for documentation site | default "docs"

@@ -108,6 +108,7 @@ defaultContentLanguage = 'en'
        # indexName = "" # Index Name to perform search on (or set env variable HUGO_PARAM_DOCSEARCH_indexName)

    [params.analytics] # Parameters for Analytics (Google, Plausible)
+        # google = "G-XXXXXXXXXX" # Replace with your Google Analytics ID
        # plausibleURL    = "/docs/s" # (or set via env variable HUGO_PARAM_ANALYTICS_plausibleURL)
        # plausibleAPI    = "/docs/s" # optional - (or set via env variable HUGO_PARAM_ANALYTICS_plausibleAPI)
        # plausibleDomain = ""      # (or set via env variable HUGO_PARAM_ANALYTICS_plausibleDomain)
@@ -151,7 +152,7 @@ defaultContentLanguage = 'en'

 [languages]
  [languages.en]
-    title = "LocalAI documentation"
+    title = "LocalAI"
    languageName = "English"
    weight = 10
 #  [languages.fr]
--- a/docs/content/docs/advanced/advanced-usage.md
+++ b/docs/content/docs/advanced/advanced-usage.md
@@ -268,14 +268,6 @@ yarn_ext_factor: 0
 yarn_attn_factor: 0
 yarn_beta_fast: 0
 yarn_beta_slow: 0
-
-# AutoGPT-Q settings, for configurations specific to GPT models.
-autogptq:
-    model_base_name: "" # Base name of the model.
-    device: "" # Device to run the model on.
-    triton: false # Whether to use Triton Inference Server.
-    use_fast_tokenizer: false # Whether to use a fast tokenizer for quicker processing.
-
 # configuration for diffusers model
 diffusers:
    cuda: false # Whether to use CUDA
--- a/docs/content/docs/features/GPU-acceleration.md
+++ b/docs/content/docs/features/GPU-acceleration.md
@@ -147,7 +147,6 @@ The devices in the following list have been tested with `hipblas` images running
 | diffusers | yes | Radeon VII (gfx906) |
 | piper | yes | Radeon VII (gfx906) |
 | whisper | no | none |
-| autogptq | no | none |
 | bark | no | none |
 | coqui | no | none |
 | transformers | no | none |
--- a/docs/content/docs/features/distributed_inferencing.md
+++ b/docs/content/docs/features/distributed_inferencing.md
@@ -13,6 +13,8 @@ LocalAI supports two modes of distributed inferencing via p2p:
 - **Federated Mode**: Requests are shared between the cluster and routed to a single worker node in the network based on the load balancer's decision.
 - **Worker Mode** (aka "model sharding" or "splitting weights"): Requests are processed by all the workers which contributes to the final inference result (by sharing the model weights).

+A list of global instances shared by the community is available at [explorer.localai.io](https://explorer.localai.io).
+
 ## Usage

 Starting LocalAI with `--p2p` generates a shared token for connecting multiple instances: and that's all you need to create AI clusters, eliminating the need for intricate network setups. 
--- a/docs/content/docs/features/text-generation.md
+++ b/docs/content/docs/features/text-generation.md
@@ -74,49 +74,9 @@ curl http://localhost:8080/v1/models

 ## Backends

-### AutoGPTQ
-
-[AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) is an easy-to-use LLMs quantization package with user-friendly apis, based on GPTQ algorithm.
-
-#### Prerequisites
-
-This is an extra backend - in the container images is already available and there is nothing to do for the setup.
-
-If you are building LocalAI locally, you need to install [AutoGPTQ manually](https://github.com/PanQiWei/AutoGPTQ#quick-installation).
-
-
-#### Model setup
-
-The models are automatically downloaded from `huggingface` if not present the first time. It is possible to define models via `YAML` config file, or just by querying the endpoint with the `huggingface` repository model name. For example, create a `YAML` config file in `models/`:
-
-```
-name: orca
-backend: autogptq
-model_base_name: "orca_mini_v2_13b-GPTQ-4bit-128g.no-act.order"
-parameters:
-  model: "TheBloke/orca_mini_v2_13b-GPTQ"
-# ...
-```
-
-Test with:
-
-```bash
-curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{                                                                                                         
-   "model": "orca",
-   "messages": [{"role": "user", "content": "How are you?"}],
-   "temperature": 0.1
- }'
-```
 ### RWKV

-A full example on how to run a rwkv model is in the [examples](https://github.com/go-skynet/LocalAI/tree/master/examples/rwkv).
-
-Note: rwkv models needs to specify the backend `rwkv` in the YAML config files and have an associated tokenizer along that needs to be provided with it:
-
-```
-36464540 -rw-r--r--  1 mudler mudler 1.2G May  3 10:51 rwkv_small
-36464543 -rw-r--r--  1 mudler mudler 2.4M May  3 10:51 rwkv_small.tokenizer.json
-```
+RWKV support is available through llama.cpp (see below).

 ### llama.cpp

--- a/docs/content/docs/getting-started/quickstart.md
+++ b/docs/content/docs/getting-started/quickstart.md
@@ -14,18 +14,47 @@ icon = "rocket_launch"

 If you are exposing LocalAI remotely, make sure you protect the API endpoints adequately with a mechanism which allows to protect from the incoming traffic or alternatively, run LocalAI with `API_KEY` to gate the access with an API key. The API key guarantees a total access to the features (there is no role separation), and it is to be considered as likely as an admin role.

-To access the WebUI with an API_KEY, browser extensions such as [Requestly](https://requestly.com/) can be used (see also https://github.com/mudler/LocalAI/issues/2227#issuecomment-2093333752). See also [API flags]({{% relref "docs/advanced/advanced-usage#api-flags" %}}) for the flags / options available when starting LocalAI.
-
 {{% /alert %}}

-## Using the Bash Installer
+## Quickstart

-Install LocalAI easily using the bash installer with the following command:

-```sh
+### Using the Bash Installer
+```bash
 curl https://localai.io/install.sh | sh
 ```

+### Run with Docker:
+```bash
+# CPU only image:
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-cpu
+
+# Nvidia GPU:
+docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
+
+# CPU and GPU image (bigger size):
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
+
+# AIO images (they pre-download a set of models ready for use, see https://localai.io/basics/container/)
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
+```
+
+### Load models:
+
+```bash
+# From the model gallery (see available models with `local-ai models list`, in the WebUI from the model tab, or visiting https://models.localai.io)
+local-ai run llama-3.2-1b-instruct:q4_k_m
+# Start LocalAI with the phi-2 model directly from huggingface
+local-ai run huggingface://TheBloke/phi-2-GGUF/phi-2.Q8_0.gguf
+# Install and run a model from the Ollama OCI registry
+local-ai run ollama://gemma:2b
+# Run a model from a configuration file
+local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
+# Install and run a model from a standard OCI registry (e.g., Docker Hub)
+local-ai run oci://localai/phi-2:latest
+```
+
+
 For a full list of options, refer to the [Installer Options]({{% relref "docs/advanced/installer" %}}) documentation.

 Binaries can also be [manually downloaded]({{% relref "docs/reference/binaries" %}}).
--- a/docs/content/docs/overview.md
+++ b/docs/content/docs/overview.md
@@ -1,4 +1,3 @@
-
 +++
 title = "Overview"
 weight = 1
@@ -7,162 +6,96 @@ description = "What is LocalAI?"
 tags = ["Beginners"]
 categories = [""]
 author = "Ettore Di Giacinto"
-# This allows to overwrite the landing page
-url = '/'
 icon = "info"
 +++

-<p align="center">
-<a href="https://localai.io"><img width=512 src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"></a>
-</p               >
+# Welcome to LocalAI

-<p align="center">
-<a href="https://github.com/go-skynet/LocalAI/fork" target="blank">
-<img src="https://img.shields.io/github/forks/go-skynet/LocalAI?style=for-the-badge" alt="LocalAI forks"/>
-</a>
-<a href="https://github.com/go-skynet/LocalAI/stargazers" target="blank">
-<img src="https://img.shields.io/github/stars/go-skynet/LocalAI?style=for-the-badge" alt="LocalAI stars"/>
-</a>
-<a href="https://github.com/go-skynet/LocalAI/pulls" target="blank">
-<img src="https://img.shields.io/github/issues-pr/go-skynet/LocalAI?style=for-the-badge" alt="LocalAI pull-requests"/>
-</a>
-<a href='https://github.com/go-skynet/LocalAI/releases'>
-<img src='https://img.shields.io/github/release/go-skynet/LocalAI?&label=Latest&style=for-the-badge'>
-</a>
-</p>
+LocalAI is your complete AI stack for running AI models locally. It's designed to be simple, efficient, and accessible, providing a drop-in replacement for OpenAI's API while keeping your data private and secure.

-<p align="center">
-<a href="https://hub.docker.com/r/localai/localai" target="blank">
-<img src="https://img.shields.io/badge/dockerhub-images-important.svg?logo=Docker" alt="LocalAI Docker hub"/>
-</a>
-<a href="https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest" target="blank">
-<img src="https://img.shields.io/badge/quay.io-images-important.svg?" alt="LocalAI Quay.io"/>
-</a>
-</p>
+## Why LocalAI?

-<p align="center">
-<a href="https://trendshift.io/repositories/5539" target="_blank"><img src="https://trendshift.io/api/badge/repositories/5539" alt="mudler%2FLocalAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
-</p>
+In today's AI landscape, privacy, control, and flexibility are paramount. LocalAI addresses these needs by:

-<p align="center">
-<a href="https://twitter.com/LocalAI_API" target="blank">
-<img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/>
-</a>
-<a href="https://discord.gg/uJAeKSAGDy" target="blank">
-<img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>
-</a>
-</p>
+- **Privacy First**: Your data never leaves your machine
+- **Complete Control**: Run models on your terms, with your hardware
+- **Open Source**: MIT licensed and community-driven
+- **Flexible Deployment**: From laptops to servers, with or without GPUs
+- **Extensible**: Add new models and features as needed

-> 💡 Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [💭Discord](https://discord.gg/uJAeKSAGDy)
->
-> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/go-skynet/LocalAI/tree/master/examples/) 
+## Core Components

+LocalAI is more than just a single tool - it's a complete ecosystem:

-**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that's compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families and architectures. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
+1. **[LocalAI Core](https://github.com/mudler/LocalAI)**
+   - OpenAI-compatible API
+   - Multiple model support (LLMs, image, audio)
+   - No GPU required
+   - Fast inference with native bindings
+   - [GitHub repository](https://github.com/mudler/LocalAI)

+2. **[LocalAGI](https://github.com/mudler/LocalAGI)**
+   - Autonomous AI agents
+   - No coding required
+   - WebUI and REST API support
+   - Extensible agent framework
+   - [GitHub repository](https://github.com/mudler/LocalAGI)

-## Start LocalAI
+3. **[LocalRecall](https://github.com/mudler/LocalRecall)**
+   - Semantic search
+   - Memory management
+   - Vector database
+   - Perfect for AI applications
+   - [GitHub repository](https://github.com/mudler/LocalRecall)

-Start the image with Docker to have a functional clone of OpenAI! 🚀:
+## Getting Started

-```bash
-docker run -p 8080:8080 --name local-ai -ti localai/localai:latest-aio-cpu
-# Do you have a Nvidia GPUs? Use this instead
-# CUDA 11
-# docker run -p 8080:8080 --gpus all --name local-ai -ti localai/localai:latest-aio-gpu-nvidia-cuda-11
-# CUDA 12
-# docker run -p 8080:8080 --gpus all --name local-ai -ti localai/localai:latest-aio-gpu-nvidia-cuda-12
-```
-
-Or just use the bash installer:
+The fastest way to get started is with our one-line installer:

 ```bash
 curl https://localai.io/install.sh | sh
 ```

-See the [💻 Quickstart](https://localai.io/basics/getting_started/) for all the options and way you can run LocalAI!
+Or use Docker for a quick start:

-## What is LocalAI?
+```bash
+docker run -p 8080:8080 --name local-ai -ti localai/localai:latest-aio-cpu
+```

-In a nutshell:
+For more detailed installation options and configurations, see our [Getting Started guide](/basics/getting_started/).

- Local, OpenAI drop-in alternative REST API. You own your data.
- NO GPU required. NO Internet access is required either
-  - Optional, GPU Acceleration is available. See also the [build section](https://localai.io/basics/build/index.html).
- Supports multiple models
- 🏃 Once loaded the first time, it keep models loaded in memory for faster inference
- ⚡ Doesn't shell-out, but uses bindings for a faster inference and better performance.
+## Key Features

-LocalAI is focused on making the AI accessible to anyone. Any contribution, feedback and PR is welcome!
+- **Text Generation**: Run various LLMs locally
+- **Image Generation**: Create images with Stable Diffusion
+- **Audio Processing**: Text-to-speech and speech-to-text
+- **Vision API**: Image understanding and analysis
+- **Embeddings**: Vector database support
+- **Functions**: OpenAI-compatible function calling
+- **P2P**: Distributed inference capabilities

-Note that this started just as a fun weekend project by [mudler](https://github.com/mudler) in order to try to create the necessary pieces for a full AI assistant like `ChatGPT`: the community is growing fast and we are working hard to make it better and more stable. If you want to help, please consider contributing (see below)!
+## Community and Support

-### 🚀 Features
+LocalAI is a community-driven project. You can:

- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
- 🗣 [Text to Audio](https://localai.io/features/text-to-audio/)
- 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`)
- 🎨 [Image generation with stable diffusion](https://localai.io/features/image-generation)
- 🔥 [OpenAI functions](https://localai.io/features/openai-functions/) 🆕
- 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
- ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
- 🖼️ [Download Models directly from Huggingface ](https://localai.io/models/)
- 🥽 [Vision API](https://localai.io/features/gpt-vision/)
- 💾 [Stores](https://localai.io/stores)
- 📈 [Reranker](https://localai.io/features/reranker/)
- 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
+- Join our [Discord community](https://discord.gg/uJAeKSAGDy)
+- Check out our [GitHub repository](https://github.com/mudler/LocalAI)
+- Contribute to the project
+- Share your use cases and examples

-## Contribute and help
+## Next Steps

-To help the project you can:
+Ready to dive in? Here are some recommended next steps:

- If you have technological skills and want to contribute to development, have a look at the open issues. If you are new you can have a look at the [good-first-issue](https://github.com/go-skynet/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) and [help-wanted](https://github.com/go-skynet/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) labels.
+1. [Install LocalAI](/basics/getting_started/)
+2. [Explore available models](https://models.localai.io)
+3. [Model compatibility](/model-compatibility/)
+4. [Try out examples](https://github.com/mudler/LocalAI-examples)
+5. [Join the community](https://discord.gg/uJAeKSAGDy)
+6. [Check the LocalAI GitHub repository](https://github.com/mudler/LocalAI)
+7. [Check the LocalAGI GitHub repository](https://github.com/mudler/LocalAGI)

- If you don't have technological skills you can still help improving documentation or [add examples](https://github.com/go-skynet/LocalAI/tree/master/examples) or share your user-stories with our community, any help and contribution is welcome!

-## 🌟 Star history
+## License

-[![LocalAI Star history Chart](https://api.star-history.com/svg?repos=mudler/LocalAI&type=Date)](https://star-history.com/#mudler/LocalAI&Date)
-
-## ❤️ Sponsors
-
-> Do you find LocalAI useful?
-
-Support the project by becoming [a backer or sponsor](https://github.com/sponsors/mudler). Your logo will show up here with a link to your website.
-
-A huge thank you to our generous sponsors who support this project covering CI expenses, and our [Sponsor list](https://github.com/sponsors/mudler):
-
-<p align="center">
-  <a href="https://www.spectrocloud.com/" target="blank">
-    <img width=200 src="https://github.com/user-attachments/assets/72eab1dd-8b93-4fc0-9ade-84db49f24962">
-  </a>
-  <a href="https://www.premai.io/" target="blank">
-    <img  width=200 src="https://github.com/mudler/LocalAI/assets/2420543/42e4ca83-661e-4f79-8e46-ae43689683d6"> <br>
-  </a>
-</p>
-
-## 📖 License
-
-LocalAI is a community-driven project created by [Ettore Di Giacinto](https://github.com/mudler/).
-
-MIT - Author Ettore Di Giacinto
-
-## 🙇 Acknowledgements
-
-LocalAI couldn't have been built without the help of great software already available from the community. Thank you!
-
- [llama.cpp](https://github.com/ggerganov/llama.cpp)
- https://github.com/tatsu-lab/stanford_alpaca
- https://github.com/cornelk/llama-go for the initial ideas
- https://github.com/antimatter15/alpaca.cpp
- https://github.com/EdVince/Stable-Diffusion-NCNN
- https://github.com/ggerganov/whisper.cpp
- https://github.com/saharNooby/rwkv.cpp
- https://github.com/rhasspy/piper
-
-## 🤗 Contributors
-
-This is a community project, a special thanks to our contributors! 🤗
-<a href="https://github.com/go-skynet/LocalAI/graphs/contributors">
-  <img src="https://contrib.rocks/image?repo=go-skynet/LocalAI" />
-</a>
+LocalAI is MIT licensed, created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
--- a/docs/data/landing.yaml
+++ b/docs/data/landing.yaml
@@ -2,38 +2,212 @@

 # Hero
 hero:
-  enable: false
+  enable: true
  weight: 10
  template: hero

+  backgroundImage:
+    path: "images/templates/hero"
+    filename:
+      desktop: "gradient-desktop.webp"
+      mobile: "gradient-mobile.webp"
+
+  badge:
+    text: "⭐ 31.8k+ stars on GitHub!"
+    color: primary
+    pill: false
+    soft: true
+
+  titleLogo:
+    path: "images/logos"
+    filename: "logo.png"
+    alt: "LocalAI Logo"
+    height: 340px
+
+  title: ""
+  subtitle: |
+    **The free alternative to OpenAI and Anthropic. Your All-in-One Complete AI Stack** - Run powerful language models, autonomous agents, and document intelligence **locally** on your hardware.
+    
+    **No cloud, no limits, no compromise.**
+
+  image:
+    path: "images"
+    filename: "localai_screenshot.png"
+    alt: "LocalAI Screenshot"
+    boxShadow: true
+    rounded: true
+
+  ctaButton:
+    icon: rocket_launch
+    btnText: "Get Started"
+    url: "/basics/getting_started/"
+  cta2Button:
+    icon: code
+    btnText: "View on GitHub"
+    url: "https://github.com/mudler/LocalAI"
+
+  info: |
+    **Drop-in replacement for the OpenAI API** - a modular suite of tools that work seamlessly together or independently.
+    
+    Start with **[LocalAI](https://localai.io)**'s OpenAI-compatible API, extend with **[LocalAGI](https://github.com/mudler/LocalAGI)**'s autonomous agents, and enhance with **[LocalRecall](https://github.com/mudler/LocalRecall)**'s semantic search - all running locally on your hardware.
+
+    **Open Source** MIT Licensed.
+
 # Feature Grid
 featureGrid:
-  enable: false
+  enable: true
  weight: 20
  template: feature grid

+  title: Why choose LocalAI?
+  subtitle: |
+    **OpenAI API Compatible** - Run AI models locally with our modular ecosystem. From language models to autonomous agents and semantic search, build your complete AI stack without the cloud.
+
+  items:
+    - title: LLM Inferencing
+      icon: memory_alt
+      description: LocalAI is a free, **Open Source** OpenAI alternative. Run **LLMs**, generate **images**, **audio** and more **locally** with consumer grade hardware.
+      ctaLink:
+        text: learn more
+        url: /basics/getting_started/
+    - title: Agentic-first
+      icon: smart_toy
+      description: |
+        Extend LocalAI with LocalAGI, an autonomous AI agent platform that runs locally, no coding required. 
+        Build and deploy autonomous agents with ease. Interact with REST APIs or use the WebUI.
+      ctaLink:
+        text: learn more
+        url: https://github.com/mudler/LocalAGI
+
+    - title: Memory and Knowledge base
+      icon: psychology
+      description: 
+        Extend LocalAI with LocalRecall, a local REST API for semantic search and memory management. Perfect for AI applications.
+      ctaLink:
+        text: learn more
+        url: https://github.com/mudler/LocalRecall
+
+    - title: OpenAI Compatible
+      icon: api
+      description: Drop-in replacement for OpenAI API. Compatible with existing applications and libraries.
+      ctaLink:
+        text: learn more
+        url: /basics/getting_started/
+
+    - title: No GPU Required
+      icon: memory
+      description: Run on consumer grade hardware. No need for expensive GPUs or cloud services.
+      ctaLink:
+        text: learn more
+        url: /basics/getting_started/
+
+    - title: Multiple Models
+      icon: hub
+      description: |
+          Support for various model families including LLMs, image generation, and audio models.
+          Supports multiple backends for inferencing, including vLLM, llama.cpp, and more.
+          You can switch between them as needed and install them from the Web interface or the CLI.
+      ctaLink:
+        text: learn more
+        url: /model-compatibility
+
+    - title: Privacy Focused
+      icon: security
+      description: Keep your data local. No data leaves your machine, ensuring complete privacy.
+      ctaLink:
+        text: learn more
+        url: /basics/container/
+
+    - title: Easy Setup
+      icon: settings
+      description: Simple installation and configuration. Get started in minutes with prebuilt binaries, Docker, Podman, Kubernetes, or a local installation.
+      ctaLink:
+        text: learn more
+        url: /basics/getting_started/
+
+    - title: Community Driven
+      icon: groups
+      description: Active community support and regular updates. Contribute and help shape the future of LocalAI.
+      ctaLink:
+        text: learn more
+        url: https://github.com/mudler/LocalAI
+
+
+
+    - title: Extensible
+      icon: extension
+      description: Easy to extend and customize. Add new models and features as needed.
+      ctaLink:
+        text: learn more
+        url: /docs/integrations/
+
+    - title: Peer 2 Peer
+      icon: hub
+      description: |
+        LocalAI is designed for decentralized LLM inference, powered by a peer-to-peer system based on libp2p.
+        It is designed to be used in a local or remote network, and is compatible with any LLM model.
+        It works either in federated mode or by splitting model weights.
+      ctaLink:
+        text: learn more
+        url: /features/distribute/
+
+    - title: Open Source
+      icon: code
+      description: MIT licensed. Free to use, modify, and distribute. Community contributions welcome.
+      ctaLink:
+        text: learn more
+        url: https://github.com/mudler/LocalAI
+
 imageText:
  enable: true
  weight: 25
  template: image text

-  title: LocalAI
-  subtitle: The Free, Open Source OpenAI Alternative
+  title: Run AI models locally with ease
+  subtitle: |
+    LocalAI makes it simple to run various AI models on your own hardware. From text generation to image creation, autonomous agents to semantic search - all orchestrated through a unified API.

  list:
-    - text: Optimized, fast inference
-      icon: speed
+    - text: OpenAI API compatibility
+      icon: api

-    - text: Comprensive support for many models architectures
-      icon: area_chart
+    - text: Multiple model support
+      icon: hub

-    - text: Easy to deploy with Docker
-      icon: accessibility
+    - text: Image understanding
+      icon: image
+    
+    - text: Image generation
+      icon: image
+
+    - text: Audio generation
+      icon: music_note
+
+    - text: Voice activity detection
+      icon: mic
+
+    - text: Speech recognition
+      icon: mic
+
+    - text: Video generation
+      icon: movie
+
+    - text: Privacy focused
+      icon: security
+
+    - text: Autonomous agents with [LocalAGI](https://github.com/mudler/LocalAGI)
+      icon: smart_toy
+
+    - text: Semantic search with [LocalRecall](https://github.com/mudler/LocalRecall)
+      icon: psychology
+
+    - text: Agent orchestration
+      icon: hub

  image:
-    path: "images/logos"
-    filename: "logo.png"
-    alt: "LocalAI logo" # Optional but recommended
+    path: "images"
+    filename: "imagen.png"
+    alt: "LocalAI Image generation"

  imgOrder:
    desktop: 2
@@ -41,10 +215,62 @@ imageText:

  ctaButton:
    text: Learn more
-    url: "/docs/"
+    url: "/basics/getting_started/"

 # Image compare
 imageCompare:
  enable: false
  weight: 30
  template: image compare
+
+  title: LocalAI in Action
+  subtitle: See how LocalAI can transform your local AI experience with various models and capabilities.
+
+  items:
+    - title: Text Generation
+      config: {
+        startingPoint: 50,
+        addCircle: true,
+        addCircleBlur: false,
+        showLabels: true,
+        labelOptions: {
+          before: 'Dark',
+          after: 'Light',
+          onHover: false
+        }
+      }
+      imagePath: "images/screenshots"
+      imageBefore: "text_generation_input.webp"
+      imageAfter: "text_generation_output.webp"
+
+    - title: Image Generation
+      config: {
+        startingPoint: 50,
+        addCircle: true,
+        addCircleBlur: true,
+        showLabels: true,
+        labelOptions: {
+          before: 'Prompt',
+          after: 'Result',
+          onHover: true
+        }
+      }
+      imagePath: "images/screenshots"
+      imageBefore: "imagen_before.webp"
+      imageAfter: "imagen_after.webp"
+
+    - title: Audio Generation
+      config: {
+        startingPoint: 50,
+        addCircle: true,
+        addCircleBlur: false,
+        showLabels: true,
+        labelOptions: {
+          before: 'Text',
+          after: 'Audio',
+          onHover: false
+        }
+      }
+      imagePath: "images/screenshots"
+      imageBefore: "audio_generation_text.webp"
+      imageAfter: "audio_generation_waveform.webp"
--- a/docs/data/version.json
+++ b/docs/data/version.json
@@ -1,3 +1,3 @@
 {
-  "version": "v2.26.0"
+  "version": "v2.28.0"
 }
--- a/docs/layouts/index.html
+++ b/docs/layouts/index.html
--- a/docs/layouts/partials/docs/top-header.html
+++ b/docs/layouts/partials/docs/top-header.html
@@ -82,7 +82,7 @@
                </span>
            </button>
            {{ end -}}
-            {{ if .Site.IsMultiLingual }}
+            {{ if hugo.IsMultilingual }}
                <div class="dropdown">
                    <button class="btn btn-link btn-default dropdown-toggle ps-2" type="button" data-bs-toggle="dropdown" aria-expanded="false">
                        {{ site.Language.Lang | upper }}
--- a/docs/layouts/partials/head.html
+++ b/docs/layouts/partials/head.html
@@ -18,10 +18,10 @@
    <!-- Custom CSS -->
    {{- $options := dict "enableSourceMap" true }}
    {{- if hugo.IsProduction}}
-        {{- $options := dict "enableSourceMap" false "outputStyle" "compressed" }}
+        {{- $options = dict "enableSourceMap" false "outputStyle" "compressed" }}
    {{- end }}
    {{- $style := resources.Get "/scss/style.scss" }}
-    {{- $style = $style | resources.ExecuteAsTemplate "/scss/style.scss" . | resources.ToCSS $options }}
+    {{- $style = $style | resources.ExecuteAsTemplate "/scss/style.scss" . | css.Sass $options }}
    {{- if hugo.IsProduction }}
        {{- $style = $style | minify | fingerprint "sha384" }}
    {{- end -}}
@@ -39,7 +39,7 @@
    <!-- Image Compare Viewer -->
    {{ if ($.Scratch.Get "image_compare_enabled") }}
        {{ $imagecompare := resources.Get "js/image-compare-viewer.min.js" }}
-        {{- if not .Site.IsServer }}
+        {{- if not hugo.IsDevelopment }}
            {{- $js := (slice $imagecompare) | resources.Concat "/js/image-compare.js" | minify | fingerprint "sha384" }}
            <script type="text/javascript" src="{{ $js.Permalink }}" integrity="{{ $js.Data.Integrity }}"></script>
        {{- else }}
@@ -48,14 +48,14 @@
        {{- end }}
    {{- end }}
    <!-- Plausible Analytics Config -->
-    {{- if not .Site.IsServer }}
+    {{- if not hugo.IsDevelopment }}
    {{ if and (.Site.Params.plausible.scriptURL) (.Site.Params.plausible.dataDomain) -}}
        {{- partialCached "head/plausible" . }}
    {{- end -}}
    {{- end -}}
    <!-- Google Analytics v4 Config -->
-    {{- if not .Site.IsServer }}
-    {{- if .Site.GoogleAnalytics }}
+    {{- if not hugo.IsDevelopment }}
+    {{- if .Site.Params.analytics.google }}
        {{- template "_internal/google_analytics.html" . -}}
    {{- end -}}
    {{- end -}}
--- a/docs/layouts/partials/header.html
+++ b/docs/layouts/partials/header.html
@@ -0,0 +1,57 @@
+<!-- Navbar Start -->
+<header id="topnav">
+    <div class="container d-flex justify-content-between align-items-center">
+        <!-- Logo container-->
+        <a class="logo" aria-label="Home" href='{{ relLangURL "" }}'>
+            
+        </a>
+        <!-- End Logo container-->
+
+        <div class="d-flex align-items-center">
+
+            <div id="navigation">
+                <!-- Navigation Menu -->
+                <ul class="navigation-menu nav-right">
+                    {{- range .Site.Menus.primary }}
+                    <li><a href="{{ relLangURL .URL }}">{{ .Name }}</a></li>
+                    {{ end }}
+                </ul><!--end navigation menu-->
+            </div><!--end navigation-->
+
+            <!-- Social Links Start -->
+            {{ with $.Scratch.Get "social_list" }}
+            <ul class="social-link d-flex list-inline mb-0">
+                {{ range . }}
+                    {{ $path := printf "images/social/%s.%s" . "svg" }}
+                    <li class="list-inline-item mb-0">
+                        <a href="{{ if eq . `rss` }} {{ `index.xml` | absURL }} {{ else if eq . `bluesky` }} https://bsky.app/profile/{{ index site.Params.social . }} {{ else }} https://{{ . }}.com/{{ index site.Params.social . }} {{ end }}" alt="{{ . }}" rel="noopener noreferrer" target="_blank">
+                            <div class="btn btn-icon btn-landing border-0">
+                                {{ with resources.Get $path }}
+                                    {{ .Content | safeHTML }}
+                                {{ end }}
+                            </div>
+                        </a>
+                    </li>
+                {{ end }}
+            </ul>
+            {{ end }}
+            <!-- Social Links End -->
+
+            <div class="menu-extras ms-3 me-2">
+                <div class="menu-item">
+                    <!-- Mobile menu toggle-->
+                    <button class="navbar-toggle btn btn-icon btn-soft-light" id="isToggle" aria-label="toggleMenu" onclick="toggleMenu()">
+                        <div class="lines">
+                            <span></span>
+                            <span></span>
+                            <span></span>
+                        </div>
+                    </button>
+                    <!-- End mobile menu toggle-->
+                </div>
+            </div>
+
+        </div>
+    </div><!--end container-->
+</header><!--end header-->
+<!-- Navbar End -->
--- a/docs/layouts/partials/logo.html
+++ b/docs/layouts/partials/logo.html
@@ -1 +1 @@
-<a href="https://localai.io"><img src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"></a>
+<a href="https://localai.io"><img src="https://raw.githubusercontent.com/mudler/LocalAI/refs/heads/master/core/http/static/logo.png"></a>
--- a/docs/netlify.toml
+++ b/docs/netlify.toml
@@ -1,4 +1,4 @@
 [build]
 [build.environment]
-HUGO_VERSION = "0.121.2"
+HUGO_VERSION = "0.146.3"
 GO_VERSION = "1.22.2"
--- a/Show More
+++ b/Show More